/**************************************************************************************
 * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
 *   "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes the software uAVS3d developed by
 *    Peking University Shenzhen Graduate School, Peng Cheng Laboratory
 *    and Guangdong Bohua UHD Innovation Corporation.
 * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
 *    Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * For more information, contact us at rgwang@pkusz.edu.cn.
 **************************************************************************************/

#include "def_arm64.S"

#if defined(__arm64__)

//void uavs3d_intra_pred_ver_arm64(pel *src, pel *dst, int i_dst, int width, int height)
//src->x0, dst->x1, i_dst->x2, width->x3, height->x4
//
// Vertical intra prediction, 8-bit samples: the row of reference samples at
// src[0..width-1] is replicated into every row of the width x height dst
// block (row stride = i_dst bytes). Dispatches on width (4/8/16/32/64/128);
// each store loop writes 4 rows per iteration (8 for width 128), so height
// is assumed to be a multiple of the unroll factor -- TODO confirm callers.
function uavs3d_intra_pred_ver_arm64

    //branch on width: ==16 / >16 / ==8 / fall through to the w4 path
    cmp w3, #16
    beq intra_pred_ver_w16
    bgt intra_pred_ver_w32x

    cmp w3, #8
    beq intra_pred_ver_w8

//intra_pred_ver_w4:

    ld1 {v0.s}[0], [x0]         // load src[0..3] once
intra_pred_ver_w4_y:
    st1 {v0.s}[0], [x1], x2     // store 4 rows per iteration
    st1 {v0.s}[0], [x1], x2
    subs w4, w4, #4
    st1 {v0.s}[0], [x1], x2
    st1 {v0.s}[0], [x1], x2
    bgt intra_pred_ver_w4_y

    b intra_pred_ver_end

intra_pred_ver_w8:

    ld1 {v0.8b}, [x0]           // load src[0..7] once
intra_pred_ver_w8_y:
    st1 {v0.8b}, [x1], x2       // store 4 rows per iteration
    st1 {v0.8b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.8b}, [x1], x2
    st1 {v0.8b}, [x1], x2
    bgt intra_pred_ver_w8_y

    b intra_pred_ver_end

intra_pred_ver_w16:

    ld1 {v0.16b}, [x0]          // load src[0..15] once
intra_pred_ver_w16_y:
    st1 {v0.16b}, [x1], x2      // store 4 rows per iteration
    st1 {v0.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.16b}, [x1], x2
    st1 {v0.16b}, [x1], x2
    bgt intra_pred_ver_w16_y

    b intra_pred_ver_end

intra_pred_ver_w32x:
    cmp w3, #64
    beq intra_pred_ver_w64
    bgt intra_pred_ver_w128

intra_pred_ver_w32:

    ld1 {v0.16b, v1.16b}, [x0]      // load src[0..31] once
intra_pred_ver_w32_y:
    st1 {v0.16b, v1.16b}, [x1], x2  // store 4 rows per iteration
    st1 {v0.16b, v1.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.16b, v1.16b}, [x1], x2
    st1 {v0.16b, v1.16b}, [x1], x2
    bgt intra_pred_ver_w32_y

    b intra_pred_ver_end

intra_pred_ver_w64:

    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]      // load src[0..63] once
intra_pred_ver_w64_y:
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2  // store 4 rows per iteration
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    bgt intra_pred_ver_w64_y
    b   intra_pred_ver_end

intra_pred_ver_w128:

    // Each 128-byte row is written as a 64-byte post-increment store plus a
    // stride-advancing store, so pre-bias the stride by the 64 bytes the
    // first store already advanced.
    sub x2, x2, #64
    // BUGFIX: the first load must post-increment x0 by 64 so v4-v7 receive
    // src[64..127]; previously both loads read src[0..63] and columns
    // 64..127 of every output row were filled with the wrong samples.
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // load src[0..63]
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]      // load src[64..127]
intra_pred_ver_w128_y:
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64  // store 8 rows per iteration
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    subs w4, w4, #8
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], x2
    bgt intra_pred_ver_w128_y

intra_pred_ver_end:

    ret


//void uavs3d_intra_pred_hor_arm64(pel *src, pel *dst, int i_dst, int width, int height)
//src->x0, dst->x1, i_dst->x2, width->x3, height->x4
//
// Horizontal intra prediction, 8-bit samples: row y of the block is filled
// with the single left reference sample src[-y] (so the caller appears to
// point src at the first left reference -- TODO confirm against callers).
// Four left samples are loaded per iteration and each is splatted across one
// row, so height is assumed to be a multiple of 4.
function uavs3d_intra_pred_hor_arm64

	//branch on width: ==16 / >16 / ==8 / fall through to the w4 path
    cmp w3, #16
    beq intra_pred_hor_w16
    bgt intra_pred_hor_w32x

    cmp w3, #8
    beq intra_pred_hor_w8

//intra_pred_hor_w4:

    sub x0, x0, #3
intra_pred_hor_w4_y:
    ld1 {v4.s}[0], [x0]  		    // load src[-y-3..-y] into b[0..3]
	dup v0.8b, v4.b[3]              // row y   <- src[-y]
    dup v1.8b, v4.b[2]              // row y+1 <- src[-y-1]
    subs w4, w4, #4
    sub x0, x0, #4                  // step back to the next 4 left refs
    dup v2.8b, v4.b[1]
    dup v3.8b, v4.b[0]
	st1 {v0.s}[0], [x1], x2  		// store dst[x]
    st1 {v1.s}[0], [x1], x2
    st1 {v2.s}[0], [x1], x2
    st1 {v3.s}[0], [x1], x2
	bgt intra_pred_hor_w4_y

	b intra_pred_hor_end

intra_pred_hor_w8:
    sub x0, x0, #3
intra_pred_hor_w8_y:
    ld1 {v4.s}[0], [x0]             // load src[-y-3..-y] into b[0..3]
    dup v0.8b, v4.b[3]              // row y   <- src[-y]
    dup v1.8b, v4.b[2]
    sub x0, x0, #4
    subs w4, w4, #4
    dup v2.8b, v4.b[1]
    dup v3.8b, v4.b[0]
    st1 {v0.8b}, [x1], x2           // store dst[x]
    st1 {v1.8b}, [x1], x2
    st1 {v2.8b}, [x1], x2
    st1 {v3.8b}, [x1], x2
	bgt intra_pred_hor_w8_y

	b intra_pred_hor_end

intra_pred_hor_w16:
    sub x0, x0, #3
intra_pred_hor_w16_y:
    ld1 {v4.s}[0], [x0]             // load src[-y-3..-y] into b[0..3]
    dup v0.16b, v4.b[3]             // row y <- src[-y]
    dup v1.16b, v4.b[2]
    subs w4, w4, #4
    sub x0, x0, #4
    dup v2.16b, v4.b[1]
    dup v3.16b, v4.b[0]
	st1 {v0.16b}, [x1], x2  	    // store dst[x]
    st1 {v1.16b}, [x1], x2
    st1 {v2.16b}, [x1], x2
    st1 {v3.16b}, [x1], x2
	bgt intra_pred_hor_w16_y

	b intra_pred_hor_end

intra_pred_hor_w32x:
    cmp w3, #64
    beq intra_pred_hor_w64
    bgt intra_pred_hor_w128

intra_pred_hor_w32:
    sub x0, x0, #3
intra_pred_hor_w32_y:
	ld1 {v16.s}[0], [x0]  			// load src[-y-3..-y] into b[0..3]
    dup v0.16b, v16.b[3]
    dup v2.16b, v16.b[2]
    dup v4.16b, v16.b[1]
    dup v6.16b, v16.b[0]
    mov v1.16b, v0.16b              // pair up so st1 writes 32B per row
    mov v3.16b, v2.16b
    mov v5.16b, v4.16b
    mov v7.16b, v6.16b
	st1 {v0.16b, v1.16b}, [x1], x2 // store dst[x]
    st1 {v2.16b, v3.16b}, [x1], x2
    sub x0, x0, #4
    subs w4, w4, #4
    st1 {v4.16b, v5.16b}, [x1], x2
    st1 {v6.16b, v7.16b}, [x1], x2
	bgt intra_pred_hor_w32_y

    b intra_pred_hor_end

intra_pred_hor_w64:
    sub x0, x0, #3
    sub x2, x2, #32                 // each row = 32B post-inc store + stride store
    intra_pred_hor_w64_y:
    ld1 {v16.s}[0], [x0]             // load src[-y-3..-y] into b[0..3]
    dup v0.16b, v16.b[3]
    dup v2.16b, v16.b[2]
    dup v4.16b, v16.b[1]
    dup v6.16b, v16.b[0]
    mov v1.16b, v0.16b              // pair up so st1 writes 32B per store
    mov v3.16b, v2.16b
    mov v5.16b, v4.16b
    mov v7.16b, v6.16b
    sub x0, x0, #4
    st1 {v0.16b, v1.16b}, [x1], #32 // store dst[x], first half of the row
    st1 {v0.16b, v1.16b}, [x1], x2  // second half, then advance to next row
    st1 {v2.16b, v3.16b}, [x1], #32
    st1 {v2.16b, v3.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v4.16b, v5.16b}, [x1], #32
    st1 {v4.16b, v5.16b}, [x1], x2
    st1 {v6.16b, v7.16b}, [x1], #32
    st1 {v6.16b, v7.16b}, [x1], x2
    bne intra_pred_hor_w64_y        // NOTE(review): bne (bgt elsewhere); equivalent only while height is a positive multiple of 4 -- confirm
    b intra_pred_hor_end

intra_pred_hor_w128:
    sub x0, x0, #3
    sub x2, x2, #96                 // each row = 3 x 32B post-inc stores + stride store
intra_pred_hor_w128_y:
    ld1 {v16.s}[0], [x0]             // load src[-y-3..-y] into b[0..3]
    dup v0.16b, v16.b[3]
    dup v2.16b, v16.b[2]
    dup v4.16b, v16.b[1]
    dup v6.16b, v16.b[0]
    mov v1.16b, v0.16b              // pair up so st1 writes 32B per store
    mov v3.16b, v2.16b
    mov v5.16b, v4.16b
    mov v7.16b, v6.16b
    sub x0, x0, #4
    st1 {v0.16b, v1.16b}, [x1], #32 // store dst[x], 128B row in 4 stores
    st1 {v0.16b, v1.16b}, [x1], #32
    st1 {v0.16b, v1.16b}, [x1], #32
    st1 {v0.16b, v1.16b}, [x1], x2
    st1 {v2.16b, v3.16b}, [x1], #32
    st1 {v2.16b, v3.16b}, [x1], #32
    st1 {v2.16b, v3.16b}, [x1], #32
    st1 {v2.16b, v3.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v4.16b, v5.16b}, [x1], #32
    st1 {v4.16b, v5.16b}, [x1], #32
    st1 {v4.16b, v5.16b}, [x1], #32
    st1 {v4.16b, v5.16b}, [x1], x2
    st1 {v6.16b, v7.16b}, [x1], #32
    st1 {v6.16b, v7.16b}, [x1], #32
    st1 {v6.16b, v7.16b}, [x1], #32
    st1 {v6.16b, v7.16b}, [x1], x2
    bne intra_pred_hor_w128_y       // NOTE(review): bne (bgt elsewhere); fine while height is a positive multiple of 4

intra_pred_hor_end:
    ret


//void uavs3d_intra_pred_dc_arm64(pel *src, pel *dst, int i_dst, int width, int height, int avail, int sample_bit_depth)
//src->x0, dst->x1, i_dst->x2, width->x3, height->x4, avail->x5, sample_bit_depth->x6
//
// DC intra prediction, 8-bit samples. avail bit0 = above row usable,
// bit1 = left column usable. The DC value is the rounded average of the
// available reference samples (above: src[1..width]; left: src[-height..-1]).
// With both rows available the division by (w + h) is done with a
// reciprocal multiply; with neither, DC falls back to 128.
// NOTE(review): sample_bit_depth (w6) is never read -- the 128 fallback and
// the byte-wide arithmetic hard-code 8-bit samples; confirm high-bit-depth
// input is dispatched elsewhere.
function uavs3d_intra_pred_dc_arm64

    and w7, w5, #2          // left avail
    and w8, w5, #1          // up avail
    lsr w7, w7, #1          // normalize left-avail flag to 0/1

    and w9, w7, w8          // both references available?
    cmp w9, #0
    bne intra_pred_dc_above_left

    cmp w8, #0
    bne intra_pred_dc_above

    cmp w7, #0
    beq intra_pred_dc_none  // neither reference available -> constant 128

intra_pred_dc_left:
    sub x10, x0, x4         // left refs: sum src[-height..-1] (order irrelevant for a sum)
    mov w7, w4              // number of refs = height
    b intra_pred_dc_single_line
intra_pred_dc_above:
    add x10, x0, #1         // above refs start at src[1]
    mov w7, w3              // number of refs = width

intra_pred_dc_single_line:
    // Sum w7 reference bytes at x10, then dc = (sum + w7/2) / w7 (w7 is a
    // power of two, so the divide is a shift). Dispatch on the ref count.
    cmp w7, #16
    beq intra_pred_dc_1ref_w16
    bgt intra_pred_dc_1ref_w32x

    cmp w7, #8
    beq intra_pred_dc_1ref_w8

//intra_pred_dc_1ref_w4:
    movi v0.8h, #0              // clear so unloaded lanes add zero
    ld1 {v0.s}[0], [x10]

    uaddlp v0.4h, v0.8b         // pairwise widen-add bytes -> halfwords
    addp v0.4h, v0.4h, v0.4h    // reduce to the 4-byte sum in h[0]

    umov w8, v0.h[0]
    add w8, w8, #2
    lsr w8, w8, #2              // dc = (sum + 2) >> 2
    dup v0.16b, w8              // splat dc byte for the fill loops

    b intra_pred_dc_fillblock

intra_pred_dc_1ref_w8:
    movi v0.8h, #0
    ld1 {v0.8b}, [x10]
    uaddlp v0.4h, v0.8b         // pairwise widen-add, then tree-reduce
    addp v0.4h, v0.4h, v0.4h
    addp v0.4h, v0.4h, v0.4h

    umov w8, v0.h[0]
    add w8, w8, #4
    lsr w8, w8, #3              // dc = (sum + 4) >> 3
    dup v0.16b, w8

    b intra_pred_dc_fillblock

intra_pred_dc_1ref_w16:
    ld1 {v0.8b, v1.8b}, [x10]
    uaddl v0.8h, v0.8b, v1.8b   // widen-add the two byte halves
    addp v0.8h, v0.8h, v0.8h    // tree-reduce 8 halfwords to the total
    addp v0.4h, v0.4h, v0.4h
    addp v0.4h, v0.4h, v0.4h

    umov w8, v0.h[0]
    add w8, w8, #8
    lsr w8, w8, #4              // dc = (sum + 8) >> 4
    dup v0.16b, w8

    b intra_pred_dc_fillblock

intra_pred_dc_1ref_w32x:
    cmp w7, #64
    beq intra_pred_dc_1ref_w64
    bgt intra_pred_dc_1ref_w128

    // 32-ref case (fallthrough)
    ld1 {v0.16b, v1.16b}, [x10]
    uaddl  v2.8h, v0.8b, v1.8b
    uaddl2 v3.8h, v0.16b, v1.16b
    add  v0.8h, v2.8h, v3.8h
    addp v0.8h, v0.8h, v0.8h    // tree-reduce to the total
    addp v0.4h, v0.4h, v0.4h
    addp v0.4h, v0.4h, v0.4h

    umov w8, v0.h[0]
    add w8, w8, #16
    lsr w8, w8, #5              // dc = (sum + 16) >> 5
    dup v0.16b, w8

    b intra_pred_dc_fillblock

intra_pred_dc_1ref_w64:
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10]
    uaddl  v4.8h, v0.8b, v1.8b
    uaddl2 v5.8h, v0.16b, v1.16b
    uaddl  v6.8h, v2.8b, v3.8b
    uaddl2 v7.8h, v2.16b, v3.16b

    add  v4.8h, v4.8h, v5.8h
    add  v6.8h, v6.8h, v7.8h
    add  v0.8h, v4.8h, v6.8h

    addp v0.8h, v0.8h, v0.8h    // tree-reduce to the total
    addp v0.4h, v0.4h, v0.4h
    addp v0.4h, v0.4h, v0.4h

    umov w8, v0.h[0]
    add w8, w8, #32
    lsr w8, w8, #6              // dc = (sum + 32) >> 6
    dup v0.16b, w8

    b intra_pred_dc_fillblock

intra_pred_dc_1ref_w128:
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x10]
    uaddl  v16.8h, v0.8b, v1.8b
    uaddl2 v17.8h, v0.16b, v1.16b
    uaddl  v18.8h, v2.8b, v3.8b
    uaddl2 v19.8h, v2.16b, v3.16b
    uaddl  v20.8h, v4.8b, v5.8b
    uaddl2 v21.8h, v4.16b, v5.16b
    uaddl  v22.8h, v6.8b, v7.8b
    uaddl2 v23.8h, v6.16b, v7.16b

    add  v16.8h, v16.8h, v17.8h
    add  v18.8h, v18.8h, v19.8h
    add  v20.8h, v20.8h, v21.8h
    add  v22.8h, v22.8h, v23.8h

    add  v16.8h, v16.8h, v18.8h
    add  v20.8h, v20.8h, v22.8h
    add  v0.8h, v16.8h, v20.8h

    addp v0.8h, v0.8h, v0.8h    // tree-reduce to the total
    addp v0.4h, v0.4h, v0.4h
    addp v0.4h, v0.4h, v0.4h

    umov w8, v0.h[0]
    add w8, w8, #64
    lsr w8, w8, #7              // dc = (sum + 64) >> 7
    dup v0.16b, w8

    b intra_pred_dc_fillblock

intra_pred_dc_none:

    movi v0.16b, #128			// iDCValue = 1 << (sample_bit_depth - 1); hard-coded for 8-bit
    b intra_pred_dc_fillblock

intra_pred_dc_above_left:
    // Both references: accumulate the above row into v0.8h (per width),
    // then add the left column (per height), then average over (w + h).
    add x10, x0, #1 			// rpSrc = pSrc + 1;

    //branch on width
    cmp w3, #16
    beq intra_pred_dc_above_left_w16
    bgt intra_pred_dc_above_left_w32x

    cmp w3, #8
    beq intra_pred_dc_above_left_w8

//intra_pred_dc_above_left_w4:

    movi v0.8h, #0              // clear so unloaded lanes add zero
    ld1 {v0.s}[0], [x10]
    uxtl v0.8h, v0.8b           // widen bytes to halfword partial sums
    b intra_pred_dc_above_left_h

intra_pred_dc_above_left_w8:
    movi v0.8h, #0
    ld1 {v0.8b}, [x10]
    uxtl v0.8h, v0.8b
    b intra_pred_dc_above_left_h

intra_pred_dc_above_left_w16:
    ld1 {v0.8b, v1.8b}, [x10]
    uaddl v0.8h, v0.8b, v1.8b
    b intra_pred_dc_above_left_h

intra_pred_dc_above_left_w32x:
    cmp w3, #64
    beq intra_pred_dc_above_left_w64
    bgt intra_pred_dc_above_left_w128

    // width 32 (fallthrough)
    ld1 {v0.16b, v1.16b}, [x10]
    uaddl  v2.8h, v0.8b, v1.8b
    uaddl2 v3.8h, v0.16b, v1.16b
    add    v0.8h, v2.8h, v3.8h
    b intra_pred_dc_above_left_h

intra_pred_dc_above_left_w64:
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10]
    uaddl  v4.8h, v0.8b, v1.8b
    uaddl2 v5.8h, v0.16b, v1.16b
    uaddl  v6.8h, v2.8b, v3.8b
    uaddl2 v7.8h, v2.16b, v3.16b

    add  v4.8h, v4.8h, v5.8h
    add  v6.8h, v6.8h, v7.8h
    add  v0.8h, v4.8h, v6.8h
    b intra_pred_dc_above_left_h

intra_pred_dc_above_left_w128:
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x10]
    uaddl  v16.8h, v0.8b, v1.8b
    uaddl2 v17.8h, v0.16b, v1.16b
    uaddl  v18.8h, v2.8b, v3.8b
    uaddl2 v19.8h, v2.16b, v3.16b
    uaddl  v20.8h, v4.8b, v5.8b
    uaddl2 v21.8h, v4.16b, v5.16b
    uaddl  v22.8h, v6.8b, v7.8b
    uaddl2 v23.8h, v6.16b, v7.16b

    add  v16.8h, v16.8h, v17.8h
    add  v18.8h, v18.8h, v19.8h
    add  v20.8h, v20.8h, v21.8h
    add  v22.8h, v22.8h, v23.8h

    add  v16.8h, v16.8h, v18.8h
    add  v20.8h, v20.8h, v22.8h
    add  v0.8h, v16.8h, v20.8h

intra_pred_dc_above_left_h:
    // v0.8h holds halfword partial sums of the above row; now add the left
    // column src[-height..-1], dispatching on height.
    cmp w4, #16
    beq intra_pred_dc_above_left_h16
    bgt intra_pred_dc_above_left_h32x

    cmp w4, #8
    beq intra_pred_dc_above_left_h8

//intra_pred_dc_above_left_h4:
    movi v1.8h, #0
    sub  x10, x0, #4            // left refs src[-4..-1]
    ld1 {v1.s}[0], [x10]
    uaddw v0.8h, v0.8h, v1.8b
    b intra_pred_dc_above_left_dcvalue

intra_pred_dc_above_left_h8:
    sub  x10, x0, #8            // left refs src[-8..-1]
    ld1 {v1.8b}, [x10]
    uaddw v0.8h, v0.8h, v1.8b
    b intra_pred_dc_above_left_dcvalue

intra_pred_dc_above_left_h16:
    sub  x10, x0, #16           // left refs src[-16..-1]
    ld1 {v1.8b, v2.8b}, [x10]
    uaddl v1.8h, v1.8b, v2.8b
    add   v0.8h, v0.8h, v1.8h
    b intra_pred_dc_above_left_dcvalue

intra_pred_dc_above_left_h32x:
    cmp w4, #64
    beq intra_pred_dc_above_left_h64
    bgt intra_pred_dc_above_left_h128

    // height 32 (fallthrough)
    sub  x10, x0, #32
    ld1 {v1.16b, v2.16b}, [x10]
    uaddl  v3.8h, v1.8b, v2.8b
    uaddl2 v4.8h, v1.16b, v2.16b
    add    v3.8h, v3.8h, v4.8h
    add    v0.8h, v0.8h, v3.8h
    b intra_pred_dc_above_left_dcvalue

intra_pred_dc_above_left_h64:
    sub  x10, x0, #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x10]
    uaddl  v16.8h, v4.8b , v5.8b
    uaddl2 v17.8h, v4.16b, v5.16b
    uaddl  v18.8h, v6.8b , v7.8b
    uaddl2 v19.8h, v6.16b, v7.16b

    add  v2.8h, v16.8h, v17.8h
    add  v3.8h, v18.8h, v19.8h
    add  v2.8h, v2.8h, v3.8h
    add  v0.8h, v0.8h, v2.8h
    b intra_pred_dc_above_left_dcvalue

intra_pred_dc_above_left_h128:
    sub  x10, x0, #128
    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x10], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x10]
    uaddl  v16.8h, v24.8b, v25.8b
    uaddl2 v17.8h, v24.16b, v25.16b
    uaddl  v18.8h, v26.8b, v27.8b
    uaddl2 v19.8h, v26.16b, v27.16b
    uaddl  v20.8h, v4.8b, v5.8b
    uaddl2 v21.8h, v4.16b, v5.16b
    uaddl  v22.8h, v6.8b, v7.8b
    uaddl2 v23.8h, v6.16b, v7.16b

    add  v16.8h, v16.8h, v17.8h
    add  v18.8h, v18.8h, v19.8h
    add  v20.8h, v20.8h, v21.8h
    add  v22.8h, v22.8h, v23.8h

    add  v16.8h, v16.8h, v18.8h
    add  v20.8h, v20.8h, v22.8h
    add  v1.8h, v16.8h, v20.8h
    add  v0.8h, v0.8h, v1.8h

intra_pred_dc_above_left_dcvalue:

    addp v0.8h, v0.8h, v0.8h    // tree-reduce 8 halfword lanes to the total sum
    addp v0.8h, v0.8h, v0.8h
    addp v0.8h, v0.8h, v0.8h

    // dc = (sum + ((w + h) >> 1)) * (4096 / (w + h)) >> 12; the reciprocal
    // multiply replaces a divide by the non-power-of-two (w + h).
    add w10, w3, w4			    // w10 = w + h
    lsr w8, w10, #1             // rounding bias (w + h) >> 1
    umov w9, v0.h[0]
    add w8, w8, w9

    mov w11, #4096				// reciprocal scale 4096 / (w + h)
    udiv w11, w11, w10
    mul w8, w8, w11
    lsr w8, w8, #12
    dup v0.16b, w8              // splat the dc byte

intra_pred_dc_fillblock:
    // Fill the width x height block with the dc byte in v0; loops write
    // 4 rows per iteration (8 for width 64/128).
    cmp w3, #16
    beq intra_pred_dc_fillblock_w16
    bgt intra_pred_dc_fillblock_w32x

    cmp w3, #8
    beq intra_pred_dc_fillblock_w8

// intra_pred_dc_fillblock_w4:

intra_pred_dc_fillblock_w4_y:
    st1 {v0.s}[0], [x1], x2  // store dst[x]
    st1 {v0.s}[0], [x1], x2
    subs w4, w4, #4
    st1 {v0.s}[0], [x1], x2
    st1 {v0.s}[0], [x1], x2
    bne intra_pred_dc_fillblock_w4_y
    b   intra_pred_dc_end

intra_pred_dc_fillblock_w8:         // label doubles as the loop head
    st1 {v0.8b}, [x1], x2       // store dst[x]
    st1 {v0.8b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.8b}, [x1], x2
    st1 {v0.8b}, [x1], x2
    bgt intra_pred_dc_fillblock_w8
    b   intra_pred_dc_end

intra_pred_dc_fillblock_w16:        // label doubles as the loop head
    st1 {v0.16b}, [x1], x2  // store dst[x]
    st1 {v0.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.16b}, [x1], x2
    st1 {v0.16b}, [x1], x2
    bgt intra_pred_dc_fillblock_w16
    b   intra_pred_dc_end

intra_pred_dc_fillblock_w32x:

    cmp w3, #64
    beq intra_pred_dc_fillblock_w64
    bgt intra_pred_dc_fillblock_w128

    mov v1.16b, v0.16b          // pair up so st1 writes 32B per row
intra_pred_dc_fillblock_w32_y:
    st1 {v0.16b, v1.16b}, [x1], x2  // store dst[x]
    st1 {v0.16b, v1.16b}, [x1], x2
    subs w4, w4, #4
    st1 {v0.16b, v1.16b}, [x1], x2
    st1 {v0.16b, v1.16b}, [x1], x2
    bgt intra_pred_dc_fillblock_w32_y
    b   intra_pred_dc_end

intra_pred_dc_fillblock_w64:

    mov v1.16b, v0.16b          // quadruple up so st1 writes 64B per row
    mov v2.16b, v0.16b
    mov v3.16b, v0.16b
intra_pred_dc_fillblock_w64_y:
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2  // store 8 rows per iteration
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    subs w4, w4, #8
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    bgt intra_pred_dc_fillblock_w64_y
    b   intra_pred_dc_end
intra_pred_dc_fillblock_w128:
    sub x2, x2, #64             // each 128B row = 64B post-inc store + stride store
    mov v1.16b, v0.16b
    mov v2.16b, v0.16b
    mov v3.16b, v0.16b
intra_pred_dc_fillblock_w128_y:
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    subs w4, w4, #8
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
    st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
    bgt intra_pred_dc_fillblock_w128_y

intra_pred_dc_end:
    ret

// (multiplier im, shift is) byte pairs used to scale the plane-mode slope
// sums into iB/iC; indexed by 2 * (log2(size) - 2), i.e. one pair each for
// size 4, 8, 16, 32, 64.
intra_plane_mul_shift:
.byte 13, 7, 17, 10, 5, 11, 11, 15, 23, 19

// Shared byte table for the plane-mode kernels:
//  bytes  0..7  : 8..1   -- tbl gather indices (and, widened, the weights)
//                          for the 8-wide slope loops
//  bytes  4..7  : 4..1   -- starting point used by the 4-wide slope paths
//  bytes  8..15 : 0..7   -- per-column multipliers x for the iB*x fill term
//  bytes 16..31 : 15..0  -- together with bytes 0..15, the 16-byte reversal
//                          indices consumed by the 16-byte tbl gathers
intra_plane_coef:
.byte 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

//void uavs3d_intra_pred_plane_arm64(pel *src, pel *dst, int i_dst, int width, int height)
//src->x0, dst->x1, i_dst->x2, width->x3, height->x4, bit_depth->x5
//
// Plane (planar gradient) intra prediction, 8-bit samples.
// coef_h / coef_v are weighted sums of mirrored reference-sample differences
// around the row/column mid-points; they are scaled by the per-size (im, is)
// pairs from intra_plane_mul_shift into the gradients iB (per column) and
// iC (per row). Each output is Clip3(0, 255, (iTmp + x*iB + y*iC) >> 5)
// with iTmp = iA - (iH2-1)*iC - (iW2-1)*iB + 16.
// NOTE(review): bit_depth (x5) is not read; the clip is hard-coded to 0..255.
// Clobbers x19/x20, which are callee-saved and saved/restored on the stack.
function uavs3d_intra_pred_plane_arm64

    sub sp, sp, #16
    stp x19, x20, [sp]              // preserve callee-saved x19/x20 (AAPCS64)

    mov x9, #61
    clz x7, x3
    clz x8, x4
    sub x15, x9, x7  					// 61 - clz64(w) == log2(width) - 2
    sub x14, x9, x8					    // log2(height) - 2

    movi v6.2s, #0
    movi v7.2s, #0

    adr x19, intra_plane_mul_shift
    lsl w15, w15, #1                    // table stores (im, is) byte pairs
    add x15, x19, x15					// im_h, is_h
    ld2 {v6.b, v7.b}[0], [x15]          // v6.s[0] <- im_h, v7.s[0] <- is_h

    lsl w14, w14, #1
    add x14, x19, x14					// im_v, is_v
    ld2 {v6.b, v7.b}[4], [x14]          // v6.s[1] <- im_v, v7.s[1] <- is_v

    lsr x10, x3, #1						// iW2 = width >> 1;
    lsr x11, x4, #1						// iH2 = height >> 1;

    add x19, x0, x10					// rpSrc = pSrc + iW2 (== pSrc + 1 + (iW2 - 1))

    cmp x10, #4
    beq intra_pred_plane_coef_h_loop4
    bgt intra_pred_plane_coef_h_loop8

// intra_pred_plane_coef_h_loop2  (width 4: iW2 == 2, done in scalar code)

    ldrb w12, [x19, #1]
    ldrb w13, [x19, #-1]
    sub w14, w12, w13                   // 1 * (right - left) difference
    ldrb w12, [x19, #2]
    ldrb w13, [x19, #-2]
    sub w15, w12, w13
    lsl w15, w15, #1                    // 2 * second difference
    add w5, w14, w15
    movi v4.4s, #0
    mov v4.s[0], w5                     // lane 0 carries coef_h (lane 1 stays 0)
    b intra_pred_plane_coef_h_end

intra_pred_plane_coef_h_loop4:          // width 8: iW2 == 4

    adr x12, intra_plane_coef
    add x12, x12, #4
    ld1 {v2.8b}, [x12]			        // 4, 3, 2, 1
    uxtl v3.8h, v2.8b                   // weights 4..1 as halfwords

    ld1 {v0.8b}, [x19]
    sub x19, x19, #4
    ld1 {v1.s}[0], [x19]                // left-side refs
    tbl v0.8b, {v0.16b}, v2.8b          // gather right-side refs reversed

    usubl v0.8h, v0.8b, v1.8b           // mirrored differences
    smull v4.4s, v0.4h, v3.4h           // weighted sum -> coef_h partials

    b intra_pred_plane_coef_h_end

intra_pred_plane_coef_h_loop8:          // width >= 16: process 8 diffs per pass

    mov w13, w10                        // loop counter = iW2

    adr x12, intra_plane_coef
    ld1 {v2.8b}, [x12]			        // 8, 7, 6, 5, 4, 3, 2, 1
    uxtl v3.8h, v2.8b                   // weights, incremented by 8 each pass
    movi v4.4s, #0
    movi v16.8h, #8
    sub x20, x19, #8                    // mirrored left-side pointer

intra_pred_plane_coef_h_loop8_x:

    ld1 {v0.16b}, [x19]
    ld1 {v1.8b}, [x20]
    tbl v0.16b, {v0.16b}, v2.16b        // gather right-side refs reversed

    usubl v0.8h, v0.8b, v1.8b           // mirrored differences
    smlal v4.4s, v0.4h, v3.4h           // accumulate weighted sum
    smlal2 v4.4s, v0.8h, v3.8h

    add v3.8h, v3.8h, v16.8h            // next block of weights
    subs w13, w13, #8
    add x19, x19, #8
    sub x20, x20, #8
    bgt intra_pred_plane_coef_h_loop8_x

//v4 -> coef_h
intra_pred_plane_coef_h_end:

    sub x19, x0, x11					// rpSrc = pSrc - iH2 (== pSrc - 1 - (iH2 - 1))

    cmp x11, #4
    beq intra_pred_plane_coef_v_loop4
    bgt intra_pred_plane_coef_v_loop8

// intra_pred_plane_coef_v_loop2  (height 4: iH2 == 2)

    ldrb w12, [x19, #1]
    ldrb w13, [x19, #-1]
    sub w14, w13, w12                   // note: reversed operand order vs. coef_h
    ldrb w12, [x19, #2]
    ldrb w13, [x19, #-2]
    sub w15, w13, w12
    lsl w15, w15, #1
    add w5, w14, w15
    movi v5.4s, #0
    mov v5.s[1], w5                     // lane 1, so the addp below routes it to coef_v
    b intra_pred_plane_coef_v_end

intra_pred_plane_coef_v_loop4:          // height 8: iH2 == 4

    adr x12, intra_plane_coef
    add x12, x12, #4
    ld1 {v2.8b}, [x12]			// 4, 3, 2, 1
    uxtl v3.8h, v2.8b

    ld1 {v0.8b}, [x19]
    sub x19, x19, #4
    ld1 {v1.s}[0], [x19]
    tbl v0.8b, {v0.16b}, v2.8b

    usubl v0.8h, v1.8b, v0.8b           // operands swapped vs. coef_h path
    smull v5.4s, v0.4h, v3.4h

    b intra_pred_plane_coef_v_end

intra_pred_plane_coef_v_loop8:          // height >= 16

    mov w13, w11                        // loop counter = iH2

    adr x12, intra_plane_coef
    ld1 {v2.8b}, [x12]			// 8, 7, 6, 5, 4, 3, 2, 1
    uxtl v3.8h, v2.8b
    movi v5.4s, #0
    movi v16.8h, #8
    sub x20, x19, #8

intra_pred_plane_coef_v_loop8_x:

    ld1 {v0.16b}, [x19]
    ld1 {v1.8b}, [x20]
    tbl v0.16b, {v0.16b}, v2.16b

    usubl v0.8h, v1.8b, v0.8b           // operands swapped vs. coef_h path
    smlal v5.4s, v0.4h, v3.4h
    smlal2 v5.4s, v0.8h, v3.8h

    add v3.8h, v3.8h, v16.8h
    subs w13, w13, #8
    add x19, x19, #8
    sub x20, x20, #8
    bgt intra_pred_plane_coef_v_loop8_x

//v5 -> coef_v
intra_pred_plane_coef_v_end:

    addp v4.4s, v4.4s, v5.4s            // reduce partials of both sums at once
    addp v4.4s, v4.4s, v4.4s      // v4.4s[0]->coef_h; v4.4s[1]->coef_v;

    // iA = (pSrc[-1 - (height - 1)] + pSrc[1 + width - 1]) << 4
    sub x6, x0, x4
    ldrb w7, [x6]
    add x6, x0, x3
    ldrb w8, [x6]
    add w6, w7, w8
    lsl w6, w6, #4

    // iB = ((coef_h << 5) * im_h + (1 << (is_h - 1))) >> is_h;
    // iC = ((coef_v << 5) * im_v + (1 << (is_v - 1))) >> is_v;
    shl v4.2s, v4.2s, #5
    mul v4.2s, v4.2s, v6.2s
    neg v7.2s, v7.2s                    // srshl with negated shift == rounding shift right
    srshl v4.2s, v4.2s, v7.2s
    umov w12, v4.s[0]
    umov w13, v4.s[1]
    dup v30.8h, w12				//v30->iB
    dup v31.8h, w13				//v31->iC

    // iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16
    sub w10, w10, #1
    sub w11, w11, #1
    mul w10, w10, w12
    mul w11, w11, w13
    sub w6, w6, w10
    sub w6, w6, w11
    add w6, w6, #16
    dup v0.8h, w6				// v0->iTmp

    adr x12, intra_plane_coef
    add x12, x12, #8
    ld1 {v2.8b}, [x12]			// 0, 1, 2, 3, 4, 5, 6, 7 (column offsets)

    cmp x3, #4
    bne intra_pred_plane_fill_loop8

//intra_pred_plane_fill_loop4:

    sxtl v2.8h, v2.8b
    mul v30.4h, v30.4h, v2.4h           // iB * x for the 4 columns

    movi v28.4h, #0			//max and min val (8-bit clip range)
    movi v29.4h, #255

    add v0.4h, v0.4h, v30.4h            // row 0 values: iTmp + x*iB
intra_pred_plane_fill_loop4_y:

// dst[x] = Clip3(0, vmax, iTmp2 >> 5);
    sshr v1.4h, v0.4h, #5
    smax v1.4h, v1.4h, v28.4h
    smin v1.4h, v1.4h, v29.4h
    xtn v1.8b, v1.8h                    // narrow to bytes and store one row
    st1 {v1.s}[0], [x1], x2

    subs w4, w4, #1
    add v0.4h, v0.4h, v31.4h   //iTmp += iC;
    bgt intra_pred_plane_fill_loop4_y

    b intra_pred_plane_fill_end

intra_pred_plane_fill_loop8:            // width >= 8: 8-column strips

    sxtl v2.8h, v2.8b
    mul v26.8h, v30.8h, v2.8h           // iB * x for the first 8 columns

    movi v28.8h, #0			//max and min val (8-bit clip range)
    movi v29.8h, #255

    shl v27.8h, v30.8h, #3  // iB * 8: step between adjacent column strips

    add v0.8h, v0.8h, v26.8h
intra_pred_plane_fill_loop8_x:

    mov v1.16b, v0.16b                  // per-strip running value
    mov x19, x1                         // per-strip dst pointer
    mov w8, w4                          // per-strip row counter
intra_pred_plane_fill_loop8_y:

    sshr v2.8h, v1.8h, #5
    smax v2.8h, v2.8h, v28.8h           // Clip3(0, 255, ...)
    smin v2.8h, v2.8h, v29.8h

    xtn v2.8b, v2.8h
    st1 {v2.8b}, [x19], x2

    subs w8, w8, #1
    add v1.8h, v1.8h, v31.8h   //iTmp += iC;
    bgt intra_pred_plane_fill_loop8_y

    add x1, x1, #8                      // next 8-column strip
    subs w3, w3, #8
    add v0.8h, v0.8h, v27.8h            // advance base by 8*iB
    bgt intra_pred_plane_fill_loop8_x

intra_pred_plane_fill_end:
    ldp x19, x20, [sp], #16             // restore callee-saved regs
    ret

//void uavs3d_intra_pred_plane_ipf_arm64(pel *src, s16 *dst, int width, int height)
//src->x0, dst->x1, width->x2, height->x3
//
// Plane prediction variant feeding the intra prediction filter (IPF): same
// gradient math as uavs3d_intra_pred_plane_arm64, but the result is stored
// as s16 after the >> 5 only -- no 0..255 clamp here (presumably the IPF
// stage clips later -- TODO confirm). dst is a dense s16 buffer, so the
// row stride is width * sizeof(s16) bytes.
// Clobbers x19/x20, which are callee-saved and saved/restored on the stack.
function uavs3d_intra_pred_plane_ipf_arm64

    sub sp, sp, #16
    stp x19, x20, [sp]                  // preserve callee-saved x19/x20 (AAPCS64)
    mov x4, x3                          // height -> x4 (match plane register layout)
    mov x3, x2                          // width  -> x3
    lsl x2, x2, #1                      // i_dst = width * sizeof(s16) bytes

    mov x9, #61
    clz x7, x3
    clz x8, x4
    sub x15, x9, x7                     // 61 - clz64(w) == log2(width) - 2
    sub x14, x9, x8                     // log2(height) - 2

    movi v6.2s, #0
    movi v7.2s, #0

    adr x19, intra_plane_mul_shift
    lsl w15, w15, #1                    // table stores (im, is) byte pairs
    add x15, x19, x15                   // im_h, is_h
    ld2 {v6.b, v7.b}[0], [x15]          // v6.s[0] <- im_h, v7.s[0] <- is_h

    lsl w14, w14, #1
    add x14, x19, x14                   // im_v, is_v
    ld2 {v6.b, v7.b}[4], [x14]          // v6.s[1] <- im_v, v7.s[1] <- is_v

    lsr x10, x3, #1                     // iW2 = width >> 1;
    lsr x11, x4, #1                     // iH2 = height >> 1;

    add x19, x0, x10                    // rpSrc = pSrc + iW2 (== pSrc + 1 + (iW2 - 1))

    cmp x10, #4
    beq intra_pred_plane_ipf_coef_h_loop4
    bgt intra_pred_plane_ipf_coef_h_loop8

// intra_pred_plane_ipf_coef_h_loop2  (width 4: iW2 == 2, scalar)

    ldrb w12, [x19, #1]
    ldrb w13, [x19, #-1]
    sub w14, w12, w13                   // 1 * (right - left) difference
    ldrb w12, [x19, #2]
    ldrb w13, [x19, #-2]
    sub w15, w12, w13
    lsl w15, w15, #1                    // 2 * second difference
    add w5, w14, w15
    movi v4.4s, #0
    mov v4.s[0], w5                     // lane 0 carries coef_h
    b intra_pred_plane_ipf_coef_h_end

intra_pred_plane_ipf_coef_h_loop4:      // width 8: iW2 == 4

    adr x12, intra_plane_coef
    add x12, x12, #4
    ld1 {v2.8b}, [x12]                    // 4, 3, 2, 1
    uxtl v3.8h, v2.8b                   // weights 4..1

    ld1 {v0.8b}, [x19]
    sub x19, x19, #4
    ld1 {v1.s}[0], [x19]                // left-side refs
    tbl v0.8b, {v0.16b}, v2.8b          // gather right-side refs reversed

    usubl v0.8h, v0.8b, v1.8b           // mirrored differences
    smull v4.4s, v0.4h, v3.4h           // weighted sum -> coef_h partials

    b intra_pred_plane_ipf_coef_h_end

intra_pred_plane_ipf_coef_h_loop8:      // width >= 16: 8 diffs per pass

    mov w13, w10                        // loop counter = iW2

    adr x12, intra_plane_coef
    ld1 {v2.8b}, [x12]                    // 8, 7, 6, 5, 4, 3, 2, 1
    uxtl v3.8h, v2.8b                   // weights, incremented by 8 each pass
    movi v4.4s, #0
    movi v16.8h, #8
    sub x20, x19, #8                    // mirrored left-side pointer

intra_pred_plane_ipf_coef_h_loop8_x:

    ld1 {v0.16b}, [x19]
    ld1 {v1.8b}, [x20]
    tbl v0.16b, {v0.16b}, v2.16b        // gather right-side refs reversed

    usubl v0.8h, v0.8b, v1.8b
    smlal v4.4s, v0.4h, v3.4h           // accumulate weighted sum
    smlal2 v4.4s, v0.8h, v3.8h

    add v3.8h, v3.8h, v16.8h
    subs w13, w13, #8
    add x19, x19, #8
    sub x20, x20, #8
    bgt intra_pred_plane_ipf_coef_h_loop8_x

//v4 -> coef_h
intra_pred_plane_ipf_coef_h_end:

    sub x19, x0, x11                    // rpSrc = pSrc - iH2 (== pSrc - 1 - (iH2 - 1))

    cmp x11, #4
    beq intra_pred_plane_ipf_coef_v_loop4
    bgt intra_pred_plane_ipf_coef_v_loop8

// intra_pred_plane_ipf_coef_v_loop2  (height 4: iH2 == 2)

    ldrb w12, [x19, #1]
    ldrb w13, [x19, #-1]
    sub w14, w13, w12                   // reversed operand order vs. coef_h
    ldrb w12, [x19, #2]
    ldrb w13, [x19, #-2]
    sub w15, w13, w12
    lsl w15, w15, #1
    add w5, w14, w15
    movi v5.4s, #0
    mov v5.s[1], w5                     // lane 1, so the addp below routes it to coef_v
    b intra_pred_plane_ipf_coef_v_end

intra_pred_plane_ipf_coef_v_loop4:      // height 8: iH2 == 4

    adr x12, intra_plane_coef
    add x12, x12, #4
    ld1 {v2.8b}, [x12]            // 4, 3, 2, 1
    uxtl v3.8h, v2.8b

    ld1 {v0.8b}, [x19]
    sub x19, x19, #4
    ld1 {v1.s}[0], [x19]
    tbl v0.8b, {v0.16b}, v2.8b

    usubl v0.8h, v1.8b, v0.8b           // operands swapped vs. coef_h path
    smull v5.4s, v0.4h, v3.4h

    b intra_pred_plane_ipf_coef_v_end

intra_pred_plane_ipf_coef_v_loop8:      // height >= 16

    mov w13, w11                        // loop counter = iH2

    adr x12, intra_plane_coef
    ld1 {v2.8b}, [x12]            // 8, 7, 6, 5, 4, 3, 2, 1
    uxtl v3.8h, v2.8b
    movi v5.4s, #0
    movi v16.8h, #8
    sub x20, x19, #8

intra_pred_plane_ipf_coef_v_loop8_x:

    ld1 {v0.16b}, [x19]
    ld1 {v1.8b}, [x20]
    tbl v0.16b, {v0.16b}, v2.16b

    usubl v0.8h, v1.8b, v0.8b           // operands swapped vs. coef_h path
    smlal v5.4s, v0.4h, v3.4h
    smlal2 v5.4s, v0.8h, v3.8h

    add v3.8h, v3.8h, v16.8h
    subs w13, w13, #8
    add x19, x19, #8
    sub x20, x20, #8
    bgt intra_pred_plane_ipf_coef_v_loop8_x

//v5 -> coef_v
intra_pred_plane_ipf_coef_v_end:

    addp v4.4s, v4.4s, v5.4s            // reduce partials of both sums at once
    addp v4.4s, v4.4s, v4.4s      // v4.4s[0]->coef_h; v4.4s[1]->coef_v;

    // iA = (pSrc[-1 - (height - 1)] + pSrc[1 + width - 1]) << 4
    sub x6, x0, x4
    ldrb w7, [x6]
    add x6, x0, x3
    ldrb w8, [x6]
    add w6, w7, w8
    lsl w6, w6, #4

    // iB = ((coef_h << 5) * im_h + (1 << (is_h - 1))) >> is_h;
    // iC = ((coef_v << 5) * im_v + (1 << (is_v - 1))) >> is_v;
    shl v4.2s, v4.2s, #5
    mul v4.2s, v4.2s, v6.2s
    neg v7.2s, v7.2s                    // srshl with negated shift == rounding shift right
    srshl v4.2s, v4.2s, v7.2s
    umov w12, v4.s[0]
    umov w13, v4.s[1]
    dup v30.8h, w12                //v30->iB
    dup v31.8h, w13                //v31->iC

    // iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16
    sub w10, w10, #1
    sub w11, w11, #1
    mul w10, w10, w12
    mul w11, w11, w13
    sub w6, w6, w10
    sub w6, w6, w11
    add w6, w6, #16
    dup v0.8h, w6                // v0->iTmp

    adr x12, intra_plane_coef
    add x12, x12, #8
    ld1 {v2.8b}, [x12]            // 0, 1, 2, 3, 4, 5, 6, 7 (column offsets)

    cmp x3, #4
    bne intra_pred_plane_ipf_fill_loop8

//intra_pred_plane_ipf_fill_loop4:

    sxtl v2.8h, v2.8b
    mul v30.4h, v30.4h, v2.4h           // iB * x for the 4 columns

    add v0.4h, v0.4h, v30.4h            // row 0 values: iTmp + x*iB
intra_pred_plane_ipf_fill_loop4_y:

    sshr v1.4h, v0.4h, #5               // no clamp: raw s16 output for IPF
    st1 {v1.4h}, [x1], x2

    subs w4, w4, #1
    add v0.4h, v0.4h, v31.4h    // iTmp += iC;
    bgt intra_pred_plane_ipf_fill_loop4_y

    b intra_pred_plane_ipf_fill_end

intra_pred_plane_ipf_fill_loop8:        // width >= 8: 8-column strips

    sxtl v2.8h, v2.8b
    mul v26.8h, v30.8h, v2.8h           // iB * x for the first 8 columns

    shl v27.8h, v30.8h, #3      // iB * 8: step between adjacent column strips

    add v0.8h, v0.8h, v26.8h
intra_pred_plane_ipf_fill_loop8_x:

    mov v1.16b, v0.16b                  // per-strip running value
    mov x19, x1                         // per-strip dst pointer
    mov w8, w4                          // per-strip row counter
intra_pred_plane_ipf_fill_loop8_y:

    sshr v2.8h, v1.8h, #5               // no clamp: raw s16 output for IPF
    st1 {v2.8h}, [x19], x2

    subs w8, w8, #1
    add v1.8h, v1.8h, v31.8h    // iTmp += iC;
    bgt intra_pred_plane_ipf_fill_loop8_y

    add x1, x1, #16                     // next 8-column strip (8 x s16 bytes)
    subs w3, w3, #8
    add v0.8h, v0.8h, v27.8h            // advance base by 8*iB
    bgt intra_pred_plane_ipf_fill_loop8_x

intra_pred_plane_ipf_fill_end:
    ldp x19, x20, [sp], #16             // restore callee-saved regs
    ret


// Weight table for the bilinear-mode 'c' term, indexed by
// |log2(width) - log2(height)|. Index 0 (-1) is never read: the
// width == height case takes a separate code path.
intra_bi_tbl_wc:
.byte -1, 21, 13, 7, 4, 2, 0, 0

//void uavs3d_intra_pred_bi_arm64(pel *pSrc, pel *dst, int i_dst, int width, int height, int sample_bit_depth)
//pSrc->x0, dst->x1, i_dst->x2, width->x3, height->x4, sample_bit_depth->x5
//---------------------------------------------------------------------------
// void uavs3d_intra_pred_bi_arm64(pel *pSrc, pel *dst, int i_dst,
//                                 int width, int height, int sample_bit_depth)
// In:  x0 = pSrc (reference line, up samples at pSrc[1..], left at pSrc[-1..]),
//      x1 = dst, x2 = i_dst (row stride in bytes), x3 = width, x4 = height,
//      x5 = sample_bit_depth (8-bit path: samples loaded with ldrb;
//           x5 is reused below as a scratch pointer)
// Clobbers x6-x15, x19/x20 (spilled here), v0-v31 (v8-v11 are spilled/restored
// around the w64 loop), plus a 3 KB stack scratch area for the working lines.
//---------------------------------------------------------------------------
function uavs3d_intra_pred_bi_arm64
    sub sp, sp, #16
    stp x19, x20, [sp]                  // save callee-saved x19/x20

    // width/height are powers of two, so log2(n) = 63 - clz64(n).
    mov x9, #63
    clz x7, x3
    clz x8, x4
    sub x6, x9, x7                  // ishift_x = tab_log2size[width];
    sub x7, x9, x8                  // ishift_y = tab_log2size[height];

    dup v4.8h, w6                   // v4 = ishift_x in every lane
    dup v5.8h, w7                   // v5 = ishift_y in every lane
    umin v6.8h, v4.8h, v5.8h        // min(ishift_x, ishift_y) (kept in v6)

    add x19, x0, x3                 // a = ref_up[width - 1] = pSrc[width]
    ldrb w12, [x19]
    sub x19, x0, x4                 // b = ref_le[height - 1] = pSrc[-height]
    ldrb w13, [x19]

    dup v0.8h, w12                  // v0 = a broadcast
    dup v1.8h, w13                  // v1 = b broadcast

    cmp x3, x4
    bne intra_pred_bi_width_ne_height

//intra_pred_bi_width_eq_height:
    urhadd v2.8h, v0.8h, v1.8h      // c = (a + b + 1) >> 1
    shl v2.8h, v2.8h, #1            // w = (c << 1) - a - b;
    sub v2.8h, v2.8h, v0.8h
    sub v2.8h, v2.8h, v1.8h

    b intra_pred_bi_reflines

// Rectangular block: derive c with a table weight instead of the plain average.
// ishift = min(ishift_x, ishift_y); wc = |ishift_x - ishift_y| indexes
// intra_bi_tbl_wc for the blend multiplier.
intra_pred_bi_width_ne_height:
    cmp x6, x7
    bgt intra_pred_bi_width_gt_height
    mov x8, x6                      // ishift
    sub x9, x7, x6                  // wc
    b   intra_pred_bi_abcw
intra_pred_bi_width_gt_height:
    mov w8, w7                      // ishift = ishift_y
    sub w9, w6, w7                  // wc = ishift_x - ishift_y
intra_pred_bi_abcw:
    lsl w12, w12, w6                // a << ishift_x
    lsl w13, w13, w7                // b << ishift_y
    add w12, w12, w13
    adr x14, intra_bi_tbl_wc
    add x14, x14, x9
    ldrsb w15, [x14]                // signed table weight tab[wc]
    mov w10, #1
    add w11, w8, #5                 // ishift + 5
    add w8, w8, #6
    mul w12, w12, w15
    lsl w10, w10, w11               // 1 << (ishift + 5)
    add w12, w12, w10
    lsr w12, w12, w8                // c = (((a << ishift_x) + (b << ishift_y)) * tab[wc] + (1 << (ishift + 5))) >> (ishift + 6)

    dup v2.8h, w12

    shl v2.8h, v2.8h, #1            // w = (c << 1) - a - b;
    sub v2.8h, v2.8h, v0.8h
    sub v2.8h, v2.8h, v1.8h

intra_pred_bi_reflines:

    // Carve five 512-byte line buffers out of a fresh 3 KB stack area.
    // x19 = old sp (base); the area is released with "add sp, #3072" at the end.
    // align (x19)
    // x19-->tmp
    mov x19, sp
    sub sp, sp, #3072

    sub x20, x19, #1024             // ref_up : top refs scaled by ishift_y (s16)
    sub x15, x19, #1536             // up     : b - pSrc[1 + x] (s16)
    sub x14, x19, #2048             // ref_le : left refs scaled by ishift_x (s16)
    sub x13, x19, #2560             // le     : a - pSrc[-1 - y] (s16)
    sub x12, x19, #3072             // wy     : per-row weight w * y (s16)

    // Build ref_up/up from the row above: pSrc[1 .. width].
    // ref_up
    add x5, x0, #1

    cmp w3, #16
    beq intra_pred_bi_refup_w16
    bgt intra_pred_bi_refup_w32x
    cmp w3, #4
    beq intra_pred_bi_refup_w4

//intra_pred_bi_refup_w8:
    ld1 {v16.8b}, [x5]
    uxtl v16.8h, v16.8b
    sub v17.8h, v1.8h, v16.8h       // up[x] = b - pSrc[1 + x]
    st1 {v17.8h}, [x15]
    sshl v17.8h, v16.8h, v5.8h      // ref_up[x] = pSrc[1 + x] << ishift_y
    st1 {v17.8h}, [x20]

    b intra_pred_bi_refup_end

intra_pred_bi_refup_w4:
    ld1 {v16.8b}, [x5]              // loads 8 bytes; only the low 4 are stored
    uxtl v16.8h, v16.8b
    sub v17.4h, v1.4h, v16.4h       // up[x] = b - pSrc[1 + x]
    st1 {v17.4h}, [x15]
    sshl v17.4h, v16.4h, v5.4h      // ref_up[x] = pSrc[1 + x] << ishift_y
    st1 {v17.4h}, [x20]

    b intra_pred_bi_refup_end

intra_pred_bi_refup_w16:
    ld1 {v16.16b}, [x5]
    uxtl v17.8h, v16.8b
    uxtl2 v18.8h, v16.16b
    sub v19.8h, v1.8h, v17.8h       // up[x] = b - pSrc[1 + x]
    sub v20.8h, v1.8h, v18.8h
    st1 {v19.8h, v20.8h}, [x15]     // up
    sshl v17.8h, v17.8h, v5.8h      // ref_up[x] = pSrc[1 + x] << ishift_y
    sshl v18.8h, v18.8h, v5.8h
    st1 {v17.8h, v18.8h}, [x20]     // ref_up

    b intra_pred_bi_refup_end

// ref_up/up setup for width 32 (one 32-byte pass) and 64 (two passes).
intra_pred_bi_refup_w32x:

    cmp w3, #64
    beq intra_pred_bi_refup_w64

//intra_pred_bi_refup_w32:
    ld1 {v16.16b, v17.16b}, [x5]
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h       // up[x] = b - pSrc[1 + x]
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]
    sshl v22.8h, v18.8h, v5.8h      // ref_up[x] = pSrc[1 + x] << ishift_y
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20]

    b intra_pred_bi_refup_end

intra_pred_bi_refup_w64:
    // First 32 samples; x15/x20 post-increment, restored to base afterwards.
    ld1 {v16.16b, v17.16b}, [x5], #32
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15], #64
    sshl v22.8h, v18.8h, v5.8h
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20], #64

    // Remaining 32 samples.
    ld1 {v16.16b, v17.16b}, [x5]
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]
    sshl v22.8h, v18.8h, v5.8h
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20]

    sub x15, x15, #64               // rewind up/ref_up pointers to buffer base
    sub x20, x20, #64

intra_pred_bi_refup_end:

    // Build ref_le/le from the left column pSrc[-1 .. -height] (stored in
    // memory in descending order, hence the tbl byte-reverse), plus the per-row
    // weight line wy[y] = w * y.
    // ref_le and le and wy
    cmp w4, #16
    beq intra_pred_bi_refle_h16
    bgt intra_pred_bi_refle_h32x
    cmp w4, #8
    blt intra_pred_bi_refle_h4
//intra_pred_bi_refle_h8:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    add x19, x19, #16
    ld1 {v3.8b}, [x19]              // 7, 6, 5, 4, 3, 2, 1, 0  (reverse shuffle)
    sxtl v6.8h, v6.8b

    sub x5, x0, #8
    ld1 {v16.8b}, [x5]
    tbl v16.8b, {v16.16b}, v3.8b    // reverse so lane y = pSrc[-1 - y]
    uxtl v16.8h, v16.8b
    sub v17.8h, v0.8h, v16.8h       // le[y] = a - pSrc[-1 - y]
    st1 {v17.8h}, [x13]             // le
    sshl v17.8h, v16.8h, v4.8h      // ref_le[y] = pSrc[-1 - y] << ishift_x
    st1 {v17.8h}, [x14]             // ref_le
    mul v17.8h, v2.8h, v6.8h        // wy[y] = w * y
    st1 {v17.8h}, [x12]             // wy

    b intra_pred_bi_refle_end

intra_pred_bi_refle_h4:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    add x19, x19, #20
    ld1 {v3.s}[0], [x19]            // 3, 2, 1, 0  (reverse shuffle)
    sxtl v6.8h, v6.8b

    sub x5, x0, #4
    ld1 {v16.s}[0], [x5]
    tbl v16.8b, {v16.16b}, v3.8b    // reverse so lane y = pSrc[-1 - y]
    uxtl v16.8h, v16.8b
    sub v17.4h, v0.4h, v16.4h       // le[y] = a - pSrc[-1 - y]
    st1 {v17.4h}, [x13]
    sshl v17.4h, v16.4h, v4.4h      // ref_le[y] = pSrc[-1 - y] << ishift_x
    st1 {v17.4h}, [x14]
    mul v17.4h, v2.4h, v6.4h        // wy[y] = w * y
    st1 {v17.4h}, [x12]

    b intra_pred_bi_refle_end

intra_pred_bi_refle_h16:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    sub x5, x0, #16
    ld1 {v16.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b  // reverse so lane y = pSrc[-1 - y]
    uxtl v17.8h, v16.8b
    uxtl2 v18.8h, v16.16b
    sub v19.8h, v0.8h, v17.8h       // le[y] = a - pSrc[-1 - y]
    sub v20.8h, v0.8h, v18.8h
    st1 {v19.8h, v20.8h}, [x13]
    sshl v19.8h, v17.8h, v4.8h      // ref_le[y] = pSrc[-1 - y] << ishift_x
    sshl v20.8h, v18.8h, v4.8h
    st1 {v19.8h, v20.8h}, [x14]
    movi v18.8h, #8
    mul v17.8h, v2.8h, v6.8h        // wy[0..7]  = w * y
    mul v18.8h, v18.8h, v2.8h       // step 8 * w
    add v18.8h, v18.8h, v17.8h      // wy[8..15] = w * y
    st1 {v17.8h, v18.8h}, [x12]

    b intra_pred_bi_refle_end

// ref_le/le/wy setup for height 32 (one pass over pSrc[-32..-1]) and 64
// (second pass over pSrc[-64..-33]); wy advances by 8*w per 8-lane vector.
intra_pred_bi_refle_h32x:

    cmp w4, #64
    beq intra_pred_bi_refle_h64

//intra_pred_bi_refle_h32:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

    sub x5, x0, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b  // reverse each 16-byte half ...
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b             // ... then consume v17 (nearest refs) first
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h       // le[y] = a - pSrc[-1 - y]
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13]
    sshl v22.8h, v18.8h, v4.8h      // ref_le[y] = pSrc[-1 - y] << ishift_x
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14]
    movi v17.8h, #8
    mul v17.8h, v17.8h, v2.8h       // step 8 * w between 8-lane groups
    mul v18.8h, v2.8h, v6.8h        // wy[0..7] = w * y
    add v19.8h, v18.8h, v17.8h
    add v20.8h, v19.8h, v17.8h
    add v21.8h, v20.8h, v17.8h
    st1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x12]

    b intra_pred_bi_refle_end

intra_pred_bi_refle_h64:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

    // Rows 0..31 from pSrc[-32..-1]; x13/x14 post-increment, rewound below.
    sub x5, x0, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13], #64
    sshl v22.8h, v18.8h, v4.8h
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14], #64

    // Rows 32..63 from pSrc[-64..-33].
    sub x5, x5, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13]
    sshl v22.8h, v18.8h, v4.8h
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14]

    // wy[0..63] = w * y, built 8 lanes at a time with step 8 * w.
    movi v17.8h, #8
    mul v17.8h, v17.8h, v2.8h
    mul v18.8h, v2.8h, v6.8h
    add v19.8h, v18.8h, v17.8h
    add v20.8h, v19.8h, v17.8h
    add v21.8h, v20.8h, v17.8h
    add v22.8h, v21.8h, v17.8h
    add v23.8h, v22.8h, v17.8h
    add v24.8h, v23.8h, v17.8h
    add v25.8h, v24.8h, v17.8h
    st1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x12], #64
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x12]

    sub x12, x12, #64               // rewind wy/le/ref_le pointers to base
    sub x13, x13, #64
    sub x14, x14, #64
    
intra_pred_bi_refle_end:

    // Main fill. Per pixel:
    //   pred = ((ref_up[x] << ishift_x) + (ref_le[y] << ishift_y)
    //           + (le[y] << ishift_y) + (x + 1) * add)
    //          >> (ishift_x + ishift_y) then rounded >> 1, clamped to u8,
    // with ref_up[x] accumulating up[x] every row and
    // add = (le[y] << ishift_y) + wy[y].
    cmp w3, #16
    beq intra_pred_bi_fill_block_w16
    bgt intra_pred_bi_fill_block_w32x
    cmp w3, #8
    blt intra_pred_bi_fill_block_w4

//intra_pred_bi_fill_block_w8:

    ld1 {v18.8h}, [x20]                 // ref_up
    ld1 {v20.8h}, [x15]                 // up

    sxtl v6.4s, v6.4h                   // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h           // ishift_x + ishift_y;
    neg v3.4s, v3.4s                    // negative => sshl performs a right shift

intra_pred_bi_fill_block_w8_y:

    add v18.8h, v18.8h, v20.8h          // ref_up[x] += up[x];

    sshll v23.4s, v18.4h, #3            // ref_up[x] << ishift_x (ishift_x = 3)
    sshll2 v24.4s, v18.8h, #3

    ldrsh w10, [x14], #2                // ref_le[y];
    ldrsh w11, [x13], #2                // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11                   // (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10                   // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    mul v21.4s, v17.4s, v6.4s           // [0, 1, 2, 3] * add (low half)
    shl v22.4s, v17.4s, #2              // + 4 * add => [4, 5, 6, 7] * add
    add v22.4s, v22.4s, v21.4s

    add v21.4s, v16.4s, v21.4s
    add v22.4s, v16.4s, v22.4s

    add v21.4s, v23.4s, v21.4s
    add v22.4s, v24.4s, v22.4s

    sshl v21.4s, v21.4s, v3.4s          // right shift ishift_x + ishift_y
    sshl v22.4s, v22.4s, v3.4s

    rshrn v21.4h, v21.4s, #1            // right shift 1
    rshrn2 v21.8h, v22.4s, #1

    uqxtn v21.8b, v21.8h                // saturate to u8 pixels

    st1 {v21.8b}, [x1], x2

    subs w4, w4, #1
    bgt intra_pred_bi_fill_block_w8_y

    b intra_pred_bi_end

intra_pred_bi_fill_block_w4:

    ld1 {v18.4h}, [x20]             // ref_up
    ld1 {v20.4h}, [x15]             // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s                // negative => sshl performs a right shift

intra_pred_bi_fill_block_w4_y:

    add v18.8h, v18.8h, v20.8h      // ref_up[x] += up[x];

    sshll v23.4s, v18.4h, #2        // ref_up[x] << ishift_x (ishift_x = 2)

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (pL[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    mul v21.4s, v17.4s, v6.4s       // [0, 1, 2, 3] * add
    add v21.4s, v16.4s, v21.4s

    add v21.4s, v23.4s, v21.4s

    sshl v21.4s, v21.4s, v3.4s      // right shift ishift_x + ishift_y
    rshrn v21.4h, v21.4s, #1        // right shift 1

    uqxtn v21.8b, v21.8h            // saturate to u8 pixels

    subs w4, w4, #1
    st1 {v21.s}[0], [x1], x2
    bgt intra_pred_bi_fill_block_w4_y

    b intra_pred_bi_end

intra_pred_bi_fill_block_w16:

    ld1 {v18.8h, v19.8h}, [x20]     // ref_up
    ld1 {v20.8h, v21.8h}, [x15]     // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s                // negative => sshl performs a right shift

intra_pred_bi_fill_block_w16_y:

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    add v18.8h, v18.8h, v20.8h      // ref_up[x] += up[x];
    add v19.8h, v19.8h, v21.8h

    mul v0.4s, v17.4s, v6.4s        // [0, 1, 2, 3] * add
    shl v1.4s, v17.4s, #2           // 4 * add
    add v2.4s, v1.4s, v0.4s         // ramp for the next 4 columns

    sshll v22.4s, v18.4h, #4        // ref_up[x] << ishift_x (ishift_x = 4)
    sshll2 v23.4s, v18.8h, #4

    add v22.4s, v16.4s, v22.4s      // (ref_up[x] << ishift_x) + val
    add v23.4s, v16.4s, v23.4s

    add v22.4s, v0.4s, v22.4s       // (ref_up[x] << ishift_x) + val + add
    add v23.4s, v2.4s, v23.4s

    sshl v22.4s, v22.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v23.4s, v23.4s, v3.4s

    rshrn v22.4h, v22.4s, #1        // right shift 1
    rshrn2 v22.8h, v23.4s, #1

    uqxtn v24.8b, v22.8h            // columns 0..7 saturated to u8

    add v0.4s, v2.4s, v1.4s         // advance ramps to columns 8..15
    add v2.4s, v0.4s, v1.4s

    sshll v22.4s, v19.4h, #4        // ref_up[x] << ishift_x (ishift_x = 4)
    sshll2 v23.4s, v19.8h, #4

    add v22.4s, v16.4s, v22.4s      // (ref_up[x] << ishift_x) + val
    add v23.4s, v16.4s, v23.4s

    add v22.4s, v0.4s, v22.4s       // (ref_up[x] << ishift_x) + val + add
    add v23.4s, v2.4s, v23.4s

    sshl v22.4s, v22.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v23.4s, v23.4s, v3.4s

    rshrn v22.4h, v22.4s, #1        // right shift 1
    rshrn2 v22.8h, v23.4s, #1

    uqxtn2 v24.16b, v22.8h          // columns 8..15 saturated to u8

    subs w4, w4, #1
    st1 {v24.16b}, [x1], x2

    bgt intra_pred_bi_fill_block_w16_y

    b intra_pred_bi_end

intra_pred_bi_fill_block_w32x:

    cmp w3, #64
    beq intra_pred_bi_fill_block_w64

// intra_pred_bi_fill_block_w32

    ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x20]     // ref_up
    ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]     // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s                // negative => sshl performs a right shift

intra_pred_bi_fill_block_w32_y:

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    add v18.8h, v18.8h, v22.8h      // ref_up[x] += up[x];
    add v19.8h, v19.8h, v23.8h
    add v20.8h, v20.8h, v24.8h
    add v21.8h, v21.8h, v25.8h

    mul v0.4s, v17.4s, v6.4s        // [0, 1, 2, 3] * add
    shl v1.4s, v17.4s, #2           // 4 * add
    add v2.4s, v1.4s, v0.4s         // ramp for the next 4 columns

    sshll v26.4s, v18.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v18.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn v26.4h, v26.4s, #1        // right shift 1
    rshrn2 v26.8h, v27.4s, #1

    uqxtn v28.8b, v26.8h            // columns 0..7

    add v0.4s, v2.4s, v1.4s         // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v19.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v19.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn v26.4h, v26.4s, #1        // right shift 1
    rshrn2 v26.8h, v27.4s, #1

    uqxtn2 v28.16b, v26.8h          // columns 8..15

    add v0.4s, v2.4s, v1.4s         // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v20.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v20.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn v26.4h, v26.4s, #1        // right shift 1
    rshrn2 v26.8h, v27.4s, #1

    uqxtn v29.8b, v26.8h            // columns 16..23

    add v0.4s, v2.4s, v1.4s         // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v21.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v21.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn v26.4h, v26.4s, #1        // right shift 1
    rshrn2 v26.8h, v27.4s, #1

    uqxtn2 v29.16b, v26.8h          // columns 24..31

    subs w4, w4, #1
    st1 {v28.16b, v29.16b}, [x1], x2

    bgt intra_pred_bi_fill_block_w32_y

    b intra_pred_bi_end

intra_pred_bi_fill_block_w64:

    mov x9, x20                         // remember ref_up buffer base for the spill

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], #64    // ref_up
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20]
    ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x15], #64    // up
    ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x15]

    // Spill callee-saved v8-v11 (AAPCS64) into the ref_up buffer, which has
    // just been fully loaded into registers and is no longer re-read.
    st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x9]        // protect registers

    sxtl v6.4s, v6.4h                   // 0, 1, 2, 3

    // v4/v5 (shift-amount dups) are dead after this; w6/w7 keep the scalars.
    uaddl v4.4s, v4.4h, v5.4h           // ishift_x + ishift_y;
    neg v5.4s, v4.4s                    // negative => sshl performs a right shift

intra_pred_bi_fill_block_w64_y:

    ldrsh w10, [x14], #2                // ref_le[y];
    ldrsh w11, [x13], #2                // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11                   // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v3.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10                   // add = (le[y] << ishift_y) + wy[y]
    dup v4.4s, w11

    add v16.8h, v16.8h, v24.8h          // ref_up[x] += up[x];
    add v17.8h, v17.8h, v25.8h
    add v18.8h, v18.8h, v26.8h
    add v19.8h, v19.8h, v27.8h
    add v20.8h, v20.8h, v28.8h
    add v21.8h, v21.8h, v29.8h
    add v22.8h, v22.8h, v30.8h
    add v23.8h, v23.8h, v31.8h

    mul v0.4s, v4.4s, v6.4s             // [0, 1, 2, 3] * add
    shl v1.4s, v4.4s, #2                // 4 * add
    add v2.4s, v1.4s, v0.4s             // ramp for the next 4 columns

    sshll v4.4s, v16.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v16.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn v8.8b, v4.8h                  // columns 0..7

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v17.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v17.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn2 v8.16b, v4.8h                // columns 8..15

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v18.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v18.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn v9.8b, v4.8h                  // columns 16..23

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v19.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v19.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn2 v9.16b, v4.8h                // columns 24..31

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v20.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v20.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn v10.8b, v4.8h                 // columns 32..39

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v21.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v21.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn2 v10.16b, v4.8h               // columns 40..47

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v22.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v22.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn v11.8b, v4.8h                 // columns 48..55

    add v0.4s, v2.4s, v1.4s             // advance ramps by 8 columns
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v23.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v23.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn v4.4h, v4.4s, #1              // right shift 1
    rshrn2 v4.8h, v7.4s, #1

    uqxtn2 v11.16b, v4.8h               // columns 56..63

    subs w4, w4, #1
    st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], x2

    bgt intra_pred_bi_fill_block_w64_y

    ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x9]        // recovery registers

intra_pred_bi_end:
    add sp, sp, #3072                   // release the scratch line buffers

    ldp x19, x20, [sp], #16             // restore callee-saved regs
ret

//void uavs3d_intra_pred_bi_ipf_arm64(pel *pSrc, s16 *dst, int width, int height)
//pSrc->x0, dst->x1, width->x2, height->x3
function uavs3d_intra_pred_bi_ipf_arm64
    sub sp, sp, #16
    stp x19, x20, [sp]
    mov x4, x3
    mov x3, x2
    lsl x2, x2, #1

    mov x9, #63
    clz x7, x3
    clz x8, x4
    sub x6, x9, x7                  // ishift_x = tab_log2size[width];
    sub x7, x9, x8                  // ishift_y = tab_log2size[height];

    dup v4.8h, w6
    dup v5.8h, w7
    umin v6.8h, v4.8h, v5.8h

    add x19, x0, x3                 // a = ref_up[width - 1] = pSrc[width]
    ldrb w12, [x19]
    sub x19, x0, x4                 // b = ref_le[height - 1] = pSrc[-height]
    ldrb w13, [x19]

    dup v0.8h, w12
    dup v1.8h, w13

    cmp x3, x4
    bne intra_pred_bi_ipf_width_ne_height

//intra_pred_bi_ipf_width_eq_height:
    urhadd v2.8h, v0.8h, v1.8h      // c = (a + b + 1) >> 1
    shl v2.8h, v2.8h, #1            // w = (c << 1) - a - b;
    sub v2.8h, v2.8h, v0.8h
    sub v2.8h, v2.8h, v1.8h

    b intra_pred_bi_ipf_reflines

intra_pred_bi_ipf_width_ne_height:
    cmp x6, x7
    bgt intra_pred_bi_ipf_width_gt_height
    mov x8, x6                      // ishift
    sub x9, x7, x6                  // wc
    b   intra_pred_bi_ipf_abcw
intra_pred_bi_ipf_width_gt_height:
    mov w8, w7
    sub w9, w6, w7
intra_pred_bi_ipf_abcw:
    lsl w12, w12, w6                // a << ishift_x
    lsl w13, w13, w7                // b << ishift_y
    add w12, w12, w13
    adr x14, intra_bi_tbl_wc
    add x14, x14, x9
    ldrsb w15, [x14]
    mov w10, #1
    add w11, w8, #5                 // ishift + 5
    add w8, w8, #6
    mul w12, w12, w15
    lsl w10, w10, w11               // 1 << (ishift + 5)
    add w12, w12, w10
    lsr w12, w12, w8                // c = (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6)

    dup v2.8h, w12

    shl v2.8h, v2.8h, #1            // w = (c << 1) - a - b;
    sub v2.8h, v2.8h, v0.8h
    sub v2.8h, v2.8h, v1.8h

intra_pred_bi_ipf_reflines:

    // align (x19)
    // x19-->tmp
    mov x19, sp
    sub sp, sp, #3072

    sub x20, x19, #1024             // ref_up
    sub x15, x19, #1536             // up
    sub x14, x19, #2048             // ref_le
    sub x13, x19, #2560             // le
    sub x12, x19, #3072             // wy

    // ref_up
    add x5, x0, #1

    cmp w3, #16
    beq intra_pred_bi_ipf_refup_w16
    bgt intra_pred_bi_ipf_refup_w32x
    cmp w3, #4
    beq intra_pred_bi_ipf_refup_w4

//intra_pred_bi_ipf_refup_w8:
    ld1 {v16.8b}, [x5]
    uxtl v16.8h, v16.8b
    sub v17.8h, v1.8h, v16.8h
    st1 {v17.8h}, [x15]
    sshl v17.8h, v16.8h, v5.8h
    st1 {v17.8h}, [x20]

    b intra_pred_bi_ipf_refup_end

intra_pred_bi_ipf_refup_w4:
    ld1 {v16.8b}, [x5]
    uxtl v16.8h, v16.8b
    sub v17.4h, v1.4h, v16.4h
    st1 {v17.4h}, [x15]
    sshl v17.4h, v16.4h, v5.4h
    st1 {v17.4h}, [x20]

    b intra_pred_bi_ipf_refup_end

intra_pred_bi_ipf_refup_w16:
    ld1 {v16.16b}, [x5]
    uxtl v17.8h, v16.8b
    uxtl2 v18.8h, v16.16b
    sub v19.8h, v1.8h, v17.8h
    sub v20.8h, v1.8h, v18.8h
    st1 {v19.8h, v20.8h}, [x15]     // up
    sshl v17.8h, v17.8h, v5.8h
    sshl v18.8h, v18.8h, v5.8h
    st1 {v17.8h, v18.8h}, [x20]     // ref_up

    b intra_pred_bi_ipf_refup_end

intra_pred_bi_ipf_refup_w32x:

    cmp w3, #64
    beq intra_pred_bi_ipf_refup_w64

//intra_pred_bi_ipf_refup_w32:
    ld1 {v16.16b, v17.16b}, [x5]
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]
    sshl v22.8h, v18.8h, v5.8h
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20]

    b intra_pred_bi_ipf_refup_end

intra_pred_bi_ipf_refup_w64:
    ld1 {v16.16b, v17.16b}, [x5], #32
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15], #64
    sshl v22.8h, v18.8h, v5.8h
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20], #64

    ld1 {v16.16b, v17.16b}, [x5]
    uxtl v18.8h, v16.8b
    uxtl2 v19.8h, v16.16b
    uxtl v20.8h, v17.8b
    uxtl2 v21.8h, v17.16b
    sub v22.8h, v1.8h, v18.8h
    sub v23.8h, v1.8h, v19.8h
    sub v24.8h, v1.8h, v20.8h
    sub v25.8h, v1.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]
    sshl v22.8h, v18.8h, v5.8h
    sshl v23.8h, v19.8h, v5.8h
    sshl v24.8h, v20.8h, v5.8h
    sshl v25.8h, v21.8h, v5.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x20]

    sub x15, x15, #64
    sub x20, x20, #64

intra_pred_bi_ipf_refup_end:

    // ref_le and le and wy
    cmp w4, #16
    beq intra_pred_bi_ipf_refle_h16
    bgt intra_pred_bi_ipf_refle_h32x
    cmp w4, #8
    blt intra_pred_bi_ipf_refle_h4
//intra_pred_bi_ipf_refle_h8:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    add x19, x19, #16
    ld1 {v3.8b}, [x19]              // 7, 6, 5, 4, 3, 2, 1, 0
    sxtl v6.8h, v6.8b

    sub x5, x0, #8
    ld1 {v16.8b}, [x5]
    tbl v16.8b, {v16.16b}, v3.8b
    uxtl v16.8h, v16.8b
    sub v17.8h, v0.8h, v16.8h
    st1 {v17.8h}, [x13]             // le
    sshl v17.8h, v16.8h, v4.8h
    st1 {v17.8h}, [x14]             // ref_le
    mul v17.8h, v2.8h, v6.8h
    st1 {v17.8h}, [x12]             // wy

    b intra_pred_bi_ipf_refle_end

intra_pred_bi_ipf_refle_h4:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    add x19, x19, #20
    ld1 {v3.s}[0], [x19]            // 3, 2, 1, 0
    sxtl v6.8h, v6.8b

    sub x5, x0, #4
    ld1 {v16.s}[0], [x5]
    tbl v16.8b, {v16.16b}, v3.8b
    uxtl v16.8h, v16.8b
    sub v17.4h, v0.4h, v16.4h
    st1 {v17.4h}, [x13]
    sshl v17.4h, v16.4h, v4.4h
    st1 {v17.4h}, [x14]
    mul v17.4h, v2.4h, v6.4h
    st1 {v17.4h}, [x12]

    b intra_pred_bi_ipf_refle_end

intra_pred_bi_ipf_refle_h16:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

    sub x5, x0, #16
    ld1 {v16.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    uxtl v17.8h, v16.8b
    uxtl2 v18.8h, v16.16b
    sub v19.8h, v0.8h, v17.8h
    sub v20.8h, v0.8h, v18.8h
    st1 {v19.8h, v20.8h}, [x13]
    sshl v19.8h, v17.8h, v4.8h
    sshl v20.8h, v18.8h, v4.8h
    st1 {v19.8h, v20.8h}, [x14]
    movi v18.8h, #8
    mul v17.8h, v2.8h, v6.8h
    mul v18.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v17.8h
    st1 {v17.8h, v18.8h}, [x12]

    b intra_pred_bi_ipf_refle_end

intra_pred_bi_ipf_refle_h32x:

    cmp w4, #64
    beq intra_pred_bi_ipf_refle_h64
 
//intra_pred_bi_ipf_refle_h32:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

    sub x5, x0, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13]
    sshl v22.8h, v18.8h, v4.8h
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14]
    movi v17.8h, #8
    mul v17.8h, v17.8h, v2.8h
    mul v18.8h, v2.8h, v6.8h
    add v19.8h, v18.8h, v17.8h
    add v20.8h, v19.8h, v17.8h
    add v21.8h, v20.8h, v17.8h
    st1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x12]

    b intra_pred_bi_ipf_refle_end

intra_pred_bi_ipf_refle_h64:

    adr x19, intra_plane_coef
    add x19, x19, #8
    ld1 {v6.8b}, [x19]              // 0, 1, 2, 3, 4, 5, 6, 7
    sxtl v6.8h, v6.8b
    add x19, x19, #8
    ld1 {v3.16b}, [x19]             // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

    sub x5, x0, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13], #64
    sshl v22.8h, v18.8h, v4.8h
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14], #64

    sub x5, x5, #32
    ld1 {v16.16b, v17.16b}, [x5]
    tbl v16.16b, {v16.16b}, v3.16b
    tbl v17.16b, {v17.16b}, v3.16b
    uxtl v18.8h, v17.8b
    uxtl2 v19.8h, v17.16b
    uxtl v20.8h, v16.8b
    uxtl2 v21.8h, v16.16b
    sub v22.8h, v0.8h, v18.8h
    sub v23.8h, v0.8h, v19.8h
    sub v24.8h, v0.8h, v20.8h
    sub v25.8h, v0.8h, v21.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x13]
    sshl v22.8h, v18.8h, v4.8h
    sshl v23.8h, v19.8h, v4.8h
    sshl v24.8h, v20.8h, v4.8h
    sshl v25.8h, v21.8h, v4.8h
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x14]

    movi v17.8h, #8
    mul v17.8h, v17.8h, v2.8h
    mul v18.8h, v2.8h, v6.8h
    add v19.8h, v18.8h, v17.8h
    add v20.8h, v19.8h, v17.8h
    add v21.8h, v20.8h, v17.8h
    add v22.8h, v21.8h, v17.8h
    add v23.8h, v22.8h, v17.8h
    add v24.8h, v23.8h, v17.8h
    add v25.8h, v24.8h, v17.8h
    st1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x12], #64
    st1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x12]

    sub x12, x12, #64
    sub x13, x13, #64
    sub x14, x14, #64
    
intra_pred_bi_ipf_refle_end:

    cmp w3, #16
    beq intra_pred_bi_ipf_fill_block_w16
    bgt intra_pred_bi_ipf_fill_block_w32x
    cmp w3, #8
    blt intra_pred_bi_ipf_fill_block_w4

//intra_pred_bi_ipf_fill_block_w8:

    ld1 {v18.8h}, [x20]                 // ref_up
    ld1 {v20.8h}, [x15]                 // up

    sxtl v6.4s, v6.4h                   // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h           // ishift_x + ishift_y;
    neg v3.4s, v3.4s

intra_pred_bi_ipf_fill_block_w8_y:

    add v18.8h, v18.8h, v20.8h          // ref_up[x] += up[x];

    sshll v23.4s, v18.4h, #3            // ref_up[x] << ishift_x (ishift_x = 3)
    sshll2 v24.4s, v18.8h, #3

    ldrsh w10, [x14], #2                // ref_le[y];
    ldrsh w11, [x13], #2                // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11                   // (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10                   // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    mul v21.4s, v17.4s, v6.4s           // [0, 1, 2 ... 7] * add
    shl v22.4s, v17.4s, #2
    add v22.4s, v22.4s, v21.4s

    add v21.4s, v16.4s, v21.4s
    add v22.4s, v16.4s, v22.4s

    add v21.4s, v23.4s, v21.4s
    add v22.4s, v24.4s, v22.4s

    sshl v21.4s, v21.4s, v3.4s          // right shift ishift_x + ishift_y
    sshl v22.4s, v22.4s, v3.4s

    rshrn v21.4h, v21.4s, #1            // right shift 1
    rshrn2 v21.8h, v22.4s, #1

    subs w4, w4, #1
    st1 {v21.8h}, [x1], x2
    bgt intra_pred_bi_ipf_fill_block_w8_y

    b intra_pred_bi_ipf_end

intra_pred_bi_ipf_fill_block_w4:

    ld1 {v18.4h}, [x20]             // ref_up
    ld1 {v20.4h}, [x15]             // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s

intra_pred_bi_ipf_fill_block_w4_y:

    add v18.8h, v18.8h, v20.8h      // ref_up[x] += up[x];

    sshll v23.4s, v18.4h, #2        // ref_up[x] << ishift_x (ishift_x = 2)

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    mul v21.4s, v17.4s, v6.4s       // [0, 1, 2, 3] * add
    add v21.4s, v16.4s, v21.4s

    add v21.4s, v23.4s, v21.4s

    sshl v21.4s, v21.4s, v3.4s      // right shift ishift_x + ishift_y
    rshrn v21.4h, v21.4s, #1        // right shift 1

    subs w4, w4, #1
    st1 {v21.4h}, [x1], x2
    bgt intra_pred_bi_ipf_fill_block_w4_y

    b intra_pred_bi_ipf_end

intra_pred_bi_ipf_fill_block_w16:

    ld1 {v18.8h, v19.8h}, [x20]     // ref_up
    ld1 {v20.8h, v21.8h}, [x15]     // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s

intra_pred_bi_ipf_fill_block_w16_y:

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    add v18.8h, v18.8h, v20.8h      // ref_up[x] += up[x];
    add v19.8h, v19.8h, v21.8h

    mul v0.4s, v17.4s, v6.4s        // [0, 1, 2, 3] * add
    shl v1.4s, v17.4s, #2           // 4 * add
    add v2.4s, v1.4s, v0.4s

    sshll v22.4s, v18.4h, #4        // ref_up[x] << ishift_x (ishift_x = 4)
    sshll2 v23.4s, v18.8h, #4

    add v22.4s, v16.4s, v22.4s      // (ref_up[x] << ishift_x) + val
    add v23.4s, v16.4s, v23.4s

    add v22.4s, v0.4s, v22.4s       // (ref_up[x] << ishift_x) + val + add
    add v23.4s, v2.4s, v23.4s

    sshl v22.4s, v22.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v23.4s, v23.4s, v3.4s

    rshrn v24.4h, v22.4s, #1        // right shift 1
    rshrn2 v24.8h, v23.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v22.4s, v19.4h, #4        // ref_up[x] << ishift_x (ishift_x = 4)
    sshll2 v23.4s, v19.8h, #4

    add v22.4s, v16.4s, v22.4s      // (ref_up[x] << ishift_x) + val
    add v23.4s, v16.4s, v23.4s

    add v22.4s, v0.4s, v22.4s       // (ref_up[x] << ishift_x) + val + add
    add v23.4s, v2.4s, v23.4s

    sshl v22.4s, v22.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v23.4s, v23.4s, v3.4s

    rshrn  v25.4h, v22.4s, #1        // right shift 1
    rshrn2 v25.8h, v23.4s, #1

    subs w4, w4, #1
    st1 {v24.8h, v25.8h}, [x1], x2
    bgt intra_pred_bi_ipf_fill_block_w16_y

    b intra_pred_bi_ipf_end

intra_pred_bi_ipf_fill_block_w32x:

    cmp w3, #64
    beq intra_pred_bi_ipf_fill_block_w64

// intra_pred_bi_ipf_fill_block_w32

    ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [x20]     // ref_up
    ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [x15]     // up

    sxtl v6.4s, v6.4h               // 0, 1, 2, 3

    uaddl v3.4s, v4.4h, v5.4h       // ishift_x + ishift_y;
    neg v3.4s, v3.4s

intra_pred_bi_ipf_fill_block_w32_y:

    ldrsh w10, [x14], #2            // ref_le[y];
    ldrsh w11, [x13], #2            // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11               // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v16.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10               // add = (le[y] << ishift_y) + wy[y]
    dup v17.4s, w11

    add v18.8h, v18.8h, v22.8h      // ref_up[x] += up[x];
    add v19.8h, v19.8h, v23.8h
    add v20.8h, v20.8h, v24.8h
    add v21.8h, v21.8h, v25.8h

    mul v0.4s, v17.4s, v6.4s        // [0, 1, 2, 3] * add
    shl v1.4s, v17.4s, #2           // 4 * add
    add v2.4s, v1.4s, v0.4s

    sshll v26.4s, v18.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v18.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn v28.4h, v26.4s, #1        // right shift 1
    rshrn2 v28.8h, v27.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v19.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v19.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn  v29.4h, v26.4s, #1       // right shift 1
    rshrn2 v29.8h, v27.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v20.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v20.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn  v30.4h, v26.4s, #1       // right shift 1
    rshrn2 v30.8h, v27.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v26.4s, v21.4h, #5        // ref_up[x] << ishift_x (ishift_x = 5)
    sshll2 v27.4s, v21.8h, #5

    add v26.4s, v16.4s, v26.4s      // (ref_up[x] << ishift_x) + val
    add v27.4s, v16.4s, v27.4s

    add v26.4s, v0.4s, v26.4s       // (ref_up[x] << ishift_x) + val + add
    add v27.4s, v2.4s, v27.4s

    sshl v26.4s, v26.4s, v3.4s      // right shift ishift_x + ishift_y
    sshl v27.4s, v27.4s, v3.4s

    rshrn  v31.4h, v26.4s, #1       // right shift 1
    rshrn2 v31.8h, v27.4s, #1

    subs w4, w4, #1
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1], x2
    bgt intra_pred_bi_ipf_fill_block_w32_y

    b intra_pred_bi_ipf_end

intra_pred_bi_ipf_fill_block_w64:

    mov x9, x20

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], #64    // ref_up
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20]
    ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x15], #64    // up
    ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x15]

    st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x9]        // protect registers

    sxtl v6.4s, v6.4h                   // 0, 1, 2, 3

    uaddl v4.4s, v4.4h, v5.4h           // ishift_x + ishift_y;
    neg v5.4s, v4.4s
    sub x2, x2, #64
intra_pred_bi_ipf_fill_block_w64_y:

    ldrsh w10, [x14], #2                // ref_le[y];
    ldrsh w11, [x13], #2                // le[y];

    lsl w10, w10, w7
    lsl w11, w11, w7
    add w10, w10, w11                   // val = (ref_le[y] << ishift_y) + (le[y] << ishift_y);
    dup v3.4s, w10

    ldrsh w10, [x12], #2
    add w11, w11, w10                   // add = (le[y] << ishift_y) + wy[y]
    dup v4.4s, w11

    add v16.8h, v16.8h, v24.8h          // ref_up[x] += up[x];
    add v17.8h, v17.8h, v25.8h
    add v18.8h, v18.8h, v26.8h
    add v19.8h, v19.8h, v27.8h
    add v20.8h, v20.8h, v28.8h
    add v21.8h, v21.8h, v29.8h
    add v22.8h, v22.8h, v30.8h
    add v23.8h, v23.8h, v31.8h

    mul v0.4s, v4.4s, v6.4s             // [0, 1, 2, 3] * add
    shl v1.4s, v4.4s, #2                // 4 * add
    add v2.4s, v1.4s, v0.4s

    sshll v4.4s, v16.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v16.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v8.4h, v4.4s, #1             // right shift 1
    rshrn2 v8.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v17.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v17.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v9.4h, v4.4s, #1             // right shift 1
    rshrn2 v9.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v18.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v18.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s             // (ref_up[x] << ishift_x) + val + add

    add v4.4s, v0.4s, v4.4s             // right shift ishift_x + ishift_y
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v10.4h, v4.4s, #1            // right shift 1
    rshrn2 v10.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v19.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v19.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v11.4h, v4.4s, #1            // right shift 1
    rshrn2 v11.8h, v7.4s, #1

    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], #64

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v20.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v20.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v8.4h, v4.4s, #1             // right shift 1
    rshrn2 v8.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v21.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v21.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v9.4h, v4.4s, #1             // right shift 1
    rshrn2 v9.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v22.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v22.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v10.4h, v4.4s, #1            // right shift 1
    rshrn2 v10.8h, v7.4s, #1

    add v0.4s, v2.4s, v1.4s
    add v2.4s, v0.4s, v1.4s

    sshll v4.4s, v23.4h, #6             // ref_up[x] << ishift_x (ishift_x = 6)
    sshll2 v7.4s, v23.8h, #6

    add v4.4s, v3.4s, v4.4s             // (ref_up[x] << ishift_x) + val
    add v7.4s, v3.4s, v7.4s

    add v4.4s, v0.4s, v4.4s             // (ref_up[x] << ishift_x) + val + add
    add v7.4s, v2.4s, v7.4s

    sshl v4.4s, v4.4s, v5.4s            // right shift ishift_x + ishift_y
    sshl v7.4s, v7.4s, v5.4s

    rshrn  v11.4h, v4.4s, #1            // right shift 1
    rshrn2 v11.8h, v7.4s, #1

    subs w4, w4, #1
    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x1], x2
    bgt intra_pred_bi_ipf_fill_block_w64_y

    ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [x9]        // recovery registers

intra_pred_bi_ipf_end:
    add sp, sp, #3072

    ldp x19, x20, [sp], #16
    ret

//void uavs3d_intra_pred_ipf_arm64(pel *src, pel *dst, int i_dst, int flt_range_hor, int flt_range_ver,
//    const s8 *flt_coef_hor, const s8 *flt_coef_ver, int w, int h, int bit_depth)
//src->x0, dst->x1, i_dst->x2, flt_range_hor->x3, flt_range_ver->x4, flt_coef_hor->x5, flt_coef_ver->x6, w->x7
//
// Intra Prediction Filter (IPF): blends the predicted block with the top row
// (per-row coefficient flt_coef_ver[row]) and the left column (per-column
// coefficients flt_coef_hor[col]), weights in /64 fixed point.
// The 9th/10th C arguments (h, bit_depth) arrive on the stack.
function uavs3d_intra_pred_ipf_arm64
#if defined(__APPLE__)
    ldr     w8, [sp]                // w8 = h   (Apple arm64 packs stack args to natural size)
    ldr     w9, [sp, #4]            // w9 = bit_depth
#else
    ldr     w8, [sp]                // w8 = h
    ldr     w9, [sp, #8]            // w9 = bit_depth (AAPCS64: 8-byte stack slots)
#endif
    add x0, x0, #1                  // p_top = src + 1

    cmp w7, #8                      // dispatch on block width
    beq intra_pred_ipf_w8
    bgt intra_pred_ipf_w16x
                                    // fall through: w == 4

intra_pred_ipf_w4:
    // Width-4 path.  Registers: w10 = row, w4 = flt_range_ver, w8 = h,
    // x6 = &flt_coef_ver[row], x12 = &src[-row-1] (walks down the left column).
    mov x10, #0                     // row = 0
    cmp w3, #0                      // flt_range_hor == 0 ?
    beq intra_pred_ipf_w4_ver       // no horizontal taps -> vertical-only path

    movi v0.8b, #64                 // weight total (coefs are /64 fixed point)
    movi v1.4h, #64                 // NOTE(review): v1/v2 appear unused in this path
    movi v2.4s, #0

    ld1 {v6.8b}, [x5]               // coef_left = flt_coef_hor[0..3]
    ld1 {v7.8b}, [x0]               // pix_top = p_top[0..3]

    sub x12, x0, #2                 // &src[-row-1]  (x0 == src + 1)

intra_pred_ipf_w4_flt_ver_hor:
    // Rows inside flt_range_ver: blend top, left and current prediction.
    cmp w10, w4                     // row < flt_range_ver ?
    bge intra_pred_ipf_w4_flt_hor
    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row]
    dup v3.4h, w14
    dup v4.8b, w14
    ldrb w15, [x12]                 // pix_left = src[-row-1] (byte load: only low 8 bits are used)
    ld1 {v16.8b}, [x1]              // pix_cur = dst[col]
    dup v5.8b, w15                  // broadcast pix_left

    ssubl v17.8h, v0.8b, v6.8b      // 64 - coef_left
    sub   v17.4h, v17.4h, v3.4h     // coef_cur = 64 - coef_left - coef_top

    uxtl  v16.8h, v16.8b
    mul   v16.4h, v16.4h, v17.4h    // coef_cur*pix_cur
    umull v18.8h, v7.8b, v4.8b      // coef_top*pix_top
    umull v19.8h, v5.8b, v6.8b      // coef_left*pix_left

    add   v16.4h, v16.4h, v18.4h    // sum the three weighted terms
    add   v16.4h, v16.4h, v19.4h
    sqrshrun v16.8b, v16.8h, #6     // (sum + 32) >> 6, saturate to u8

    add w10, w10, #1                // row += 1
    add x6, x6, #1
    sub x12, x12, #1
    st1 {v16.s}[0], [x1], x2        // store 4 pixels, advance dst by i_dst
    b intra_pred_ipf_w4_flt_ver_hor

intra_pred_ipf_w4_flt_hor:
    // Remaining rows (row >= flt_range_ver): horizontal-only blend of
    // pix_left into the 4-wide prediction.
    cmp w10, w8
    bge intra_pred_ipf_end          // if (row >= h) end
    ldrb w15, [x12]                 // pix_left = src[-row-1]
    ld1 {v4.8b}, [x1]               // pix_cur = dst[col]
    dup v5.8b, w15                  // pix_left
    sub v3.8b, v0.8b, v6.8b         // coef_cur = 64 - coef_left
    umull v5.8h, v5.8b, v6.8b       // coef_left*pix_left
    umull v3.8h, v3.8b, v4.8b       // coef_cur*pix_cur
    add v3.4h, v3.4h, v5.4h

    add w10, w10, #1                // row += 1
    sub x12, x12, #1
    sqrshrun v3.8b, v3.8h, #6       // (sum + 32) >> 6, saturate to u8
    st1 {v3.s}[0], [x1], x2         // store 4 pixels, next row
    b intra_pred_ipf_w4_flt_hor

intra_pred_ipf_w4_ver:
    // Vertical-only path (flt_range_hor == 0): blend pix_top into the first
    // flt_range_ver rows of the 4-wide prediction; later rows untouched.
    mov x13, #64
    ld1 {v3.8b}, [x0]               // pix_top = p_top[col]
intra_pred_ipf_w4_ver_y:
    cmp w10, w4
    bge intra_pred_ipf_end          // if (row >= flt_range_ver) end

    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row]
    sub w12, w13, w14               // coef_cur = 64 - coef_top
    ld1 {v4.8b}, [x1]               // pix_cur
    dup v5.8b, w14                  // coef_top
    dup v6.8b, w12                  // coef_cur

    umull v5.8h, v3.8b, v5.8b       // coef_top*pix_top
    umull v6.8h, v4.8b, v6.8b       // coef_cur*pix_cur
    add v5.4h, v5.4h, v6.4h
    sqrshrun v5.8b, v5.8h, #6       // (sum + 32) >> 6, saturate to u8
    add w10, w10, #1                // row += 1
    add x6, x6, #1
    st1 {v5.s}[0], [x1], x2         // store 4 pixels, next row
    b intra_pred_ipf_w4_ver_y

intra_pred_ipf_w8:
    // Width-8 path.  Registers: w10 = row, w4 = flt_range_ver, w8 = h,
    // x6 = &flt_coef_ver[row], x12 = &src[-row-1] (walks down the left column).
    mov x10, #0                     // row = 0
    cmp w3, #0                      // flt_range_hor == 0 ?
    beq intra_pred_ipf_w8_ver       // no horizontal taps -> vertical-only path

    movi v0.8b, #64                 // weight total (coefs are /64 fixed point)
    movi v1.8h, #64                 // NOTE(review): v1/v2 appear unused in this path
    movi v2.4s, #0

    ld1 {v6.8b}, [x5]               // coef_left = flt_coef_hor[0..7]
    ld1 {v7.8b}, [x0]               // pix_top = p_top[0..7]

    sub x12, x0, #2                 // &src[-row-1]  (x0 == src + 1)

intra_pred_ipf_w8_flt_ver_hor:
    // Rows inside flt_range_ver: blend top, left and current prediction.
    cmp w10, w4                     // row < flt_range_ver ?
    bge intra_pred_ipf_w8_flt_hor
    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row]
    dup v3.8h, w14
    dup v4.8b, w14
    ldrb w15, [x12]                 // pix_left = src[-row-1] (byte load: only low 8 bits are used)
    ld1 {v16.8b}, [x1]              // pix_cur = dst[col]
    dup v5.8b, w15                  // broadcast pix_left

    ssubl v17.8h, v0.8b, v6.8b      // 64 - coef_left
    uxtl  v16.8h, v16.8b
    sub   v17.8h, v17.8h, v3.8h     // coef_cur = 64 - coef_left - coef_top
    umull v18.8h, v7.8b, v4.8b      // coef_top*pix_top
    umull v19.8h, v5.8b, v6.8b      // coef_left*pix_left
    mul   v16.8h, v16.8h, v17.8h    // coef_cur*pix_cur

    add   v18.8h, v18.8h, v19.8h    // sum the three weighted terms
    add   v16.8h, v16.8h, v18.8h
    sqrshrun v16.8b, v16.8h, #6     // (sum + 32) >> 6, saturate to u8

    add w10, w10, #1                // row += 1
    add x6, x6, #1
    sub x12, x12, #1
    st1 {v16.8b}, [x1], x2          // store 8 pixels, advance dst by i_dst
    b intra_pred_ipf_w8_flt_ver_hor

intra_pred_ipf_w8_flt_hor:
    // Remaining rows (row >= flt_range_ver): horizontal-only blend of
    // pix_left into the 8-wide prediction.
    cmp w10, w8
    bge intra_pred_ipf_end          // if (row >= h) end
    ldrb w15, [x12]                 // pix_left = src[-row-1]
    ld1 {v4.8b}, [x1]               // pix_cur = dst[col]
    dup v5.8b, w15                  // pix_left
    sub v3.8b, v0.8b, v6.8b         // coef_cur = 64 - coef_left
    umull v5.8h, v5.8b, v6.8b       // coef_left*pix_left
    umull v3.8h, v3.8b, v4.8b       // coef_cur*pix_cur
    add v3.8h, v3.8h, v5.8h

    add w10, w10, #1                // row += 1
    sub x12, x12, #1
    sqrshrun v3.8b, v3.8h, #6       // (sum + 32) >> 6, saturate to u8
    st1 {v3.8b}, [x1], x2           // store 8 pixels, next row
    b intra_pred_ipf_w8_flt_hor

intra_pred_ipf_w8_ver:
    // Vertical-only path (flt_range_hor == 0): blend pix_top into the first
    // flt_range_ver rows of the 8-wide prediction; later rows untouched.
    mov x13, #64
    ld1 {v3.8b}, [x0]               // pix_top = p_top[col]
intra_pred_ipf_w8_ver_y:
    cmp w10, w4
    bge intra_pred_ipf_end          // if (row >= flt_range_ver) end

    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row]
    sub w12, w13, w14               // coef_cur = 64 - coef_top
    ld1 {v4.8b}, [x1]               // pix_cur
    dup v5.8b, w14                  // coef_top
    dup v6.8b, w12                  // coef_cur

    umull v5.8h, v3.8b, v5.8b       // coef_top*pix_top
    umull v6.8h, v4.8b, v6.8b       // coef_cur*pix_cur
    add v5.8h, v5.8h, v6.8h
    sqrshrun v5.8b, v5.8h, #6       // (sum + 32) >> 6, saturate to u8
    add w10, w10, #1                // row += 1
    add x6, x6, #1
    st1 {v5.8b}, [x1], x2           // store 8 pixels, next row
    b intra_pred_ipf_w8_ver_y

intra_pred_ipf_w16x:
    // Width >= 16 path.  Only the first 16 columns receive the horizontal
    // (left-pixel) blend; columns 16..w-1 get the vertical blend only.
    // Registers: w10 = row, w11 = col, x12 = &src[-row-1].
    mov x10, #0                     // row = 0
    cmp w3, #0                      // flt_range_hor == 0
    beq intra_pred_ipf_w16x_ver

    movi v0.16b, #64                // weight total (coefs are /64 fixed point)
    ld1 {v1.16b}, [x5]              // coef_left = flt_coef_hor[0..15]
    movi v2.4s, #0

    mov x12, #-2
    add x12, x0, x12                // &src[-row-1]  (x0 == src + 1)
    sub x2, x2, x7                  // x2 = i_dst - w: dst advances #16 per column chunk
intra_pred_ipf_w16x_flt_ver_hor:
    mov x13, #64
    cmp w10, w4                     // row < flt_range_ver ?
    bge intra_pred_ipf_w16x_flt_hor
    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row]
    ldrb w15, [x12]                 // pix_left = src[-row-1]
    sub w11, w13, w14               // coef_tmp = 64 - flt_coef_ver[row]
    dup v3.16b, w14                 // coef_top
    dup v4.16b, w11                 // coef_tmp

    // Columns 0..15: three-way blend (top + left + current).
    ld1 {v16.16b}, [x1]             // pix_cur = dst[col]
    ld1 {v7.16b}, [x0], #16         // pix_top = p_top[col]
    dup v5.16b, w15                 // pix_left
    ssubl  v24.8h, v4.8b, v1.8b     // coef_cur = 64 - coef_left - coef_top
    ssubl2 v25.8h, v4.16b, v1.16b
    uxtl  v26.8h, v16.8b            // pix_cur
    uxtl2 v27.8h, v16.16b

    umull  v18.8h, v3.8b, v7.8b     // coef_top*pix_top
    umull2 v22.8h, v3.16b, v7.16b
    umull  v19.8h, v1.8b, v5.8b     // coef_left*pix_left
    umull2 v23.8h, v1.16b, v5.16b
    mul    v20.8h, v24.8h, v26.8h   // coef_cur*pix_cur
    mul    v21.8h, v25.8h, v27.8h

    add   v18.8h, v18.8h, v19.8h    // sum the three weighted terms
    add   v22.8h, v22.8h, v23.8h
    add   v18.8h, v20.8h, v18.8h
    add   v22.8h, v21.8h, v22.8h
    sqrshrun v18.8b, v18.8h, #6     // (sum + 32) >> 6, saturate to u8
    sqrshrun v19.8b, v22.8h, #6
    st1 {v18.8b, v19.8b}, [x1], #16

    mov x11, #16                    // for(col = 16; col < w; col += 16)
intra_pred_ipf_w16x_flt_ver:
    // Columns >= 16: vertical blend only (coef_left taps exhausted).
    cmp w11, w7
    bge intra_pred_ipf_w16x_flt_row_end
    ld1 {v6.16b}, [x1]              // pix_cur = dst[col]
    ld1 {v7.16b}, [x0], #16         // pix_top = p_top[col]

    umull  v16.8h, v3.8b, v7.8b     // coef_top*pix_top
    umull2 v17.8h, v3.16b, v7.16b
    umull  v18.8h, v4.8b, v6.8b     // coef_tmp*pix_cur
    umull2 v19.8h, v4.16b, v6.16b
    add    v16.8h, v16.8h, v18.8h
    add    v17.8h, v17.8h, v19.8h
    sqrshrun v16.8b, v16.8h, #6     // (sum + 32) >> 6, saturate to u8
    sqrshrun v17.8b, v17.8h, #6
    st1 {v16.8b, v17.8b}, [x1], #16
    add w11, w11, #16
    b intra_pred_ipf_w16x_flt_ver

intra_pred_ipf_w16x_flt_row_end:
    add w10, w10, #1                // row += 1
    add x6, x6, #1
    sub x12, x12, #1
    sub x0, x0, x7                  // rewind p_top to column 0
    add x1, x1, x2                  // x1 += i_dst - w -> start of next dst row
    b intra_pred_ipf_w16x_flt_ver_hor

intra_pred_ipf_w16x_flt_hor:
    add x2, x2, x7                  // restore full stride (x2 held i_dst - w)
intra_pred_ipf_w16x_flt_hor_loop:
    // Remaining rows (row >= flt_range_ver): horizontal-only blend.
    // Only the first 16 columns are touched — NOTE(review): assumes
    // flt_range_hor <= 16 so farther columns need no blending; confirm.
    cmp w10, w8
    bge intra_pred_ipf_end          // if (row >= h) end
    ldrb w15, [x12]                 // pix_left = src[-row-1]
    ld1 {v4.16b}, [x1]              // pix_cur = dst[col]
    dup v5.16b, w15                 // pix_left
    sub v3.16b, v0.16b, v1.16b      // coef_cur = 64 - coef_left
    umull  v16.8h, v1.8b, v5.8b     // coef_left*pix_left
    umull2 v17.8h, v1.16b, v5.16b
    umull  v18.8h, v3.8b, v4.8b     // coef_cur*pix_cur
    umull2 v19.8h, v3.16b, v4.16b

    add v16.8h, v16.8h, v18.8h
    add v17.8h, v17.8h, v19.8h
    sqrshrun v16.8b, v16.8h, #6     // (sum + 32) >> 6, saturate to u8
    sqrshrun v17.8b, v17.8h, #6

    add w10, w10, #1                // row += 1
    sub x12, x12, #1
    st1 {v16.8b, v17.8b}, [x1], x2  // store 16 pixels, advance dst by i_dst

    b intra_pred_ipf_w16x_flt_hor_loop

intra_pred_ipf_w16x_ver:
    // Vertical-only path (flt_range_hor == 0) for widths >= 16:
    // blend pix_top into the first flt_range_ver rows, 16 columns per step.
    mov x13, #64
    sub x2, x2, x7                  // x2 = i_dst - w: dst advances #16 per chunk below
intra_pred_ipf_w16x_ver_y:
    cmp w10, w4
    bge intra_pred_ipf_end          // if (row >= flt_range_ver) end
    mov x11, #0                     // col = 0
intra_pred_ipf_w16x_ver_x:
    ldrb w14, [x6]                  // coef_top = flt_coef_ver[row] (reloaded each chunk)
    sub w12, w13, w14               // coef_cur = 64 - coef_top
    ld1 {v3.16b}, [x0], #16         // pix_top = p_top[col]
    ld1 {v4.16b}, [x1]              // pix_cur = dst[col]
    dup v5.16b, w14                 // coef_top
    dup v6.16b, w12                 // coef_cur

    umull  v16.8h, v3.8b, v5.8b     // coef_top*pix_top
    umull2 v17.8h, v3.16b, v5.16b
    umull  v18.8h, v4.8b, v6.8b     // coef_cur*pix_cur
    umull2 v19.8h, v4.16b, v6.16b

    add v16.8h, v16.8h, v18.8h
    add v17.8h, v17.8h, v19.8h
    sqrshrun v16.8b, v16.8h, #6     // (sum + 32) >> 6, saturate to u8
    sqrshrun v17.8b, v17.8h, #6

    add w11, w11, #16
    st1 {v16.8b, v17.8b}, [x1], #16
    cmp w11, w7                     // col < w ?
    blt intra_pred_ipf_w16x_ver_x

    add w10, w10, #1                // row += 1
    add x6, x6, #1
    sub x0, x0, x7                  // rewind p_top to column 0
    add x1, x1, x2                  // x1 += i_dst - w -> start of next dst row

    b intra_pred_ipf_w16x_ver_y

intra_pred_ipf_end:
    ret

#endif


